Compare commits
23 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
444a7d16cc | ||
|
|
fd722692a4 | ||
|
|
99cece524c | ||
|
|
c27449c60e | ||
|
|
5ef879e307 | ||
|
|
e7df63bae1 | ||
|
|
17ff3811f8 | ||
|
|
fc7fe0b08e | ||
|
|
3cf75a541a | ||
|
|
1f750d3edd | ||
|
|
b2b0444131 | ||
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 |
@@ -40,6 +40,8 @@ type App struct {
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
// StatusDB is the unified component health store (nil if unavailable).
|
||||
StatusDB *ComponentStatusDB
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -80,6 +82,7 @@ type installer interface {
|
||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
LiveBootSource() platform.LiveBootSource
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
@@ -100,6 +103,10 @@ func (a *App) IsLiveMediaInRAM() bool {
|
||||
return a.installer.IsLiveMediaInRAM()
|
||||
}
|
||||
|
||||
func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||
return a.installer.LiveBootSource()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
@@ -131,7 +138,7 @@ type runtimeChecker interface {
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
a := &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
@@ -140,19 +147,32 @@ func New(platform *platform.System) *App {
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||
a.StatusDB = db
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
snap, err := readAuditSnapshot(auditJSON)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
return schema.HardwareIngestRequest{}, err
|
||||
}
|
||||
collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt)
|
||||
return snap, nil
|
||||
}
|
||||
|
||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||
if runtimeMode == runtimeenv.ModeLiveCD {
|
||||
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
||||
@@ -160,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
@@ -276,6 +296,9 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -733,6 +756,7 @@ func (a *App) HealthSummaryResult() ActionResult {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
||||
var body strings.Builder
|
||||
@@ -767,6 +791,7 @@ func (a *App) MainBanner() string {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ""
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
var lines []string
|
||||
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
||||
|
||||
@@ -660,13 +660,50 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = filepath.Join(tmp, "sat")
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
raw := `{
|
||||
"collected_at": "2026-03-15T10:00:00Z",
|
||||
"hardware": {
|
||||
"board": {"serial_number": "SRV123"},
|
||||
"storage": [
|
||||
{"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
|
||||
{"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
|
||||
],
|
||||
"pcie_devices": [
|
||||
{"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
|
||||
{"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
|
||||
]
|
||||
}
|
||||
}`
|
||||
|
||||
got, err := ApplySATOverlay([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("ApplySATOverlay error: %v", err)
|
||||
}
|
||||
text := string(got)
|
||||
if contains(text, "Virtual HDisk0") {
|
||||
t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
|
||||
}
|
||||
if contains(text, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
|
||||
}
|
||||
if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
|
||||
t.Fatalf("overlaid audit should keep real devices:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
exportDir := filepath.Join(tmp, "export")
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
@@ -698,6 +735,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
var auditJSON string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
@@ -707,6 +745,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
t.Fatalf("read tar entry: %v", err)
|
||||
}
|
||||
names = append(names, hdr.Name)
|
||||
if contains(hdr.Name, "/export/bee-audit.json") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read audit entry: %v", err)
|
||||
}
|
||||
auditJSON = string(body)
|
||||
}
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
@@ -721,6 +766,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if !foundRaw {
|
||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||
}
|
||||
if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
@@ -734,6 +785,10 @@ func TestMainBanner(t *testing.T) {
|
||||
product := "PowerEdge R760"
|
||||
cpuModel := "Intel Xeon Gold 6430"
|
||||
memoryType := "DDR5"
|
||||
memorySerialA := "DIMM-A"
|
||||
memorySerialB := "DIMM-B"
|
||||
storageSerialA := "DISK-A"
|
||||
storageSerialB := "DISK-B"
|
||||
gpuClass := "VideoController"
|
||||
gpuModel := "NVIDIA H100"
|
||||
|
||||
@@ -749,12 +804,12 @@ func TestMainBanner(t *testing.T) {
|
||||
{Model: &cpuModel},
|
||||
},
|
||||
Memory: []schema.HardwareMemory{
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &gpuClass, Model: &gpuModel},
|
||||
|
||||
266
audit/internal/app/component_status_db.go
Normal file
266
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,266 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||
// the component stays at the highest observed severity until explicitly reset.
|
||||
type ComponentStatusDB struct {
|
||||
path string
|
||||
mu sync.Mutex
|
||||
records map[string]*ComponentStatusRecord
|
||||
}
|
||||
|
||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||
type ComponentStatusRecord struct {
|
||||
ComponentKey string `json:"component_key"`
|
||||
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||
LastChangedAt time.Time `json:"last_changed_at"`
|
||||
ErrorSummary string `json:"error_summary,omitempty"`
|
||||
History []ComponentStatusEntry `json:"history"`
|
||||
}
|
||||
|
||||
// ComponentStatusEntry is one observation written to a component's history.
|
||||
type ComponentStatusEntry struct {
|
||||
At time.Time `json:"at"`
|
||||
Status string `json:"status"`
|
||||
Source string `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
|
||||
Detail string `json:"detail,omitempty"`
|
||||
}
|
||||
|
||||
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
db := &ComponentStatusDB{
|
||||
path: path,
|
||||
records: make(map[string]*ComponentStatusRecord),
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) > 0 {
|
||||
var records []ComponentStatusRecord
|
||||
if err := json.Unmarshal(data, &records); err == nil {
|
||||
for i := range records {
|
||||
db.records[records[i].ComponentKey] = &records[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// Record writes one observation for the given component key.
|
||||
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||
// OK never downgrades an existing Warning or Critical status.
|
||||
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||
if db == nil || strings.TrimSpace(key) == "" {
|
||||
return
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
rec, exists := db.records[key]
|
||||
if !exists {
|
||||
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||
db.records[key] = rec
|
||||
}
|
||||
rec.LastCheckedAt = now
|
||||
|
||||
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||
rec.History = append(rec.History, entry)
|
||||
|
||||
// Status merge: OK never downgrades Warning/Critical.
|
||||
newSev := componentSeverity(status)
|
||||
curSev := componentSeverity(rec.Status)
|
||||
if newSev > curSev {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
rec.ErrorSummary = detail
|
||||
} else if rec.Status == "" {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
}
|
||||
|
||||
_ = db.saveLocked()
|
||||
}
|
||||
|
||||
// Get returns the current record for a component key.
|
||||
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||
if db == nil {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
r, ok := db.records[key]
|
||||
if !ok {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
return *r, true
|
||||
}
|
||||
|
||||
// All returns a snapshot of all records.
|
||||
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||
if db == nil {
|
||||
return nil
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
out = append(out, *r)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (db *ComponentStatusDB) saveLocked() error {
|
||||
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
records = append(records, *r)
|
||||
}
|
||||
data, err := json.MarshalIndent(records, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(db.path, data, 0644)
|
||||
}
|
||||
|
||||
// componentSeverity returns a numeric severity so higher values win.
|
||||
func componentSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
return 3
|
||||
case "Warning":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||
// and writes component status records to db for the given SAT target.
|
||||
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||
return
|
||||
}
|
||||
archivePath = extractArchivePath(archivePath)
|
||||
if archivePath == "" {
|
||||
return
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
if overall == "" {
|
||||
return
|
||||
}
|
||||
|
||||
source := "sat:" + target
|
||||
dbStatus := satStatusToDBStatus(overall)
|
||||
|
||||
// Map SAT target to component keys.
|
||||
switch target {
|
||||
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
|
||||
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||
case "memory", "memory-stress", "sat-stress":
|
||||
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "cpu", "platform-stress":
|
||||
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "storage":
|
||||
// Try to record per-device if available in summary.
|
||||
recordedAny := false
|
||||
for key, val := range kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||
recordedAny = true
|
||||
}
|
||||
if !recordedAny {
|
||||
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func satStatusToDBStatus(overall string) string {
|
||||
switch overall {
|
||||
case "OK":
|
||||
return "OK"
|
||||
case "FAILED":
|
||||
return "Warning"
|
||||
case "PARTIAL", "UNSUPPORTED":
|
||||
return "Unknown"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||
func ExtractArchivePath(s string) string {
|
||||
return extractArchivePath(s)
|
||||
}
|
||||
|
||||
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||
// file located in the run directory alongside archivePath.
|
||||
// Returns "" if the file cannot be read.
|
||||
func ReadSATOverallStatus(archivePath string) string {
|
||||
if strings.TrimSpace(archivePath) == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
}
|
||||
|
||||
func extractArchivePath(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
if strings.HasSuffix(s, ".tar.gz") {
|
||||
parts := strings.Fields(s)
|
||||
if len(parts) > 0 {
|
||||
return parts[len(parts)-1]
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func parseSATKV(raw string) map[string]string {
|
||||
kv := make(map[string]string)
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
|
||||
if ok {
|
||||
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
||||
}
|
||||
}
|
||||
return kv
|
||||
}
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||
applyComponentStatusDB(snap, db)
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||
if snap == nil || db == nil {
|
||||
return
|
||||
}
|
||||
for _, rec := range db.All() {
|
||||
key := rec.ComponentKey
|
||||
status := dbStatusToSATStatus(rec.Status)
|
||||
if status == "" {
|
||||
continue
|
||||
}
|
||||
detail := rec.ErrorSummary
|
||||
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(key, "pcie:"):
|
||||
bdf := strings.TrimPrefix(key, "pcie:")
|
||||
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||
if sanitizeBDFForLookup(bdf) == "" {
|
||||
break
|
||||
}
|
||||
normalized := sanitizeBDFForLookup(bdf)
|
||||
for i := range snap.PCIeDevices {
|
||||
if snap.PCIeDevices[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "storage:"):
|
||||
devName := strings.TrimPrefix(key, "storage:")
|
||||
if devName == "all" {
|
||||
for i := range snap.Storage {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
} else {
|
||||
for i := range snap.Storage {
|
||||
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "memory:"):
|
||||
for i := range snap.Memory {
|
||||
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
case strings.HasPrefix(key, "cpu:"):
|
||||
for i := range snap.CPUs {
|
||||
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
|
||||
// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
|
||||
func dbStatusToSATStatus(s string) string {
|
||||
switch strings.TrimSpace(s) {
|
||||
case "OK", "Warning", "Critical", "Unknown":
|
||||
return s
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
|
||||
// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
|
||||
func sanitizeBDFForLookup(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") {
|
||||
return ""
|
||||
}
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
bdf = "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
|
||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
|
||||
@@ -27,13 +27,35 @@ var supportBundleCommands = []struct {
|
||||
cmd []string
|
||||
}{
|
||||
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
||||
{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
|
||||
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
done
|
||||
done
|
||||
`}},
|
||||
}
|
||||
|
||||
var supportBundleOptionalFiles = []struct {
|
||||
name string
|
||||
src string
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "bee-support-*.tar.gz"
|
||||
@@ -77,6 +99,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
|
||||
}
|
||||
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -184,6 +209,24 @@ func writeCommandOutput(dst string, cmd []string) error {
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func copyOptionalFile(src, dst string) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
_, err = io.Copy(out, in)
|
||||
return err
|
||||
}
|
||||
|
||||
func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
@@ -247,7 +290,7 @@ func copyDirContents(srcDir, dstDir string) error {
|
||||
}
|
||||
|
||||
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||
if cleanRel == "" {
|
||||
return true
|
||||
@@ -259,7 +302,25 @@ func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
|
||||
}
|
||||
|
||||
func normalizeSupportBundleAuditJSON(path string) error {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
normalized, err := ApplySATOverlay(data)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return os.WriteFile(path, normalized, 0644)
|
||||
}
|
||||
|
||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
|
||||
@@ -1,10 +1,18 @@
|
||||
package collector
|
||||
|
||||
import "bee/audit/internal/schema"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
finalizeSnapshot(snap, collectedAt)
|
||||
}
|
||||
|
||||
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
snap.Memory = filterMemory(snap.Memory)
|
||||
snap.Storage = filterStorage(snap.Storage)
|
||||
snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
|
||||
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||
|
||||
setComponentStatusMetadata(snap, collectedAt)
|
||||
@@ -33,11 +41,25 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
|
||||
continue
|
||||
}
|
||||
out = append(out, disk)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
out := make([]schema.HardwarePCIeDevice, 0, len(devs))
|
||||
for _, dev := range devs {
|
||||
if dev.DeviceClass != nil && strings.Contains(strings.ToLower(strings.TrimSpace(*dev.DeviceClass)), "co-processor") {
|
||||
continue
|
||||
}
|
||||
out = append(out, dev)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
|
||||
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
present := true
|
||||
status := statusOK
|
||||
serial := "SN-1"
|
||||
virtualModel := "Virtual HDisk1"
|
||||
realModel := "PASCARI"
|
||||
coProcessorClass := "Co-processor"
|
||||
gpuClass := "VideoController"
|
||||
|
||||
snap := schema.HardwareSnapshot{
|
||||
Memory: []schema.HardwareMemory{
|
||||
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PowerSupplies: []schema.HardwarePowerSupply{
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
@@ -31,9 +41,12 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
||||
}
|
||||
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
||||
}
|
||||
if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
|
||||
t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
|
||||
}
|
||||
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
||||
}
|
||||
|
||||
@@ -13,14 +13,18 @@ import (
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
PCIeLinkGenCurrent *int
|
||||
PCIeLinkGenMax *int
|
||||
PCIeLinkWidthCur *int
|
||||
PCIeLinkWidthMax *int
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 9 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
||||
if len(rec) < 13 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseMaybeInt(v string) *int {
|
||||
v = strings.TrimSpace(v)
|
||||
if v == "" || strings.EqualFold(v, "n/a") || strings.EqualFold(v, "not supported") || strings.EqualFold(v, "[not supported]") {
|
||||
return nil
|
||||
}
|
||||
n, err := strconv.Atoi(v)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &n
|
||||
}
|
||||
|
||||
func pcieLinkGenLabel(gen int) string {
|
||||
return fmt.Sprintf("Gen%d", gen)
|
||||
}
|
||||
|
||||
func parseMaybeBool(v string) *bool {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
switch v {
|
||||
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if info.HWSlowdown != nil {
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||
// knows the negotiated speed regardless of the current power state.
|
||||
if info.PCIeLinkGenCurrent != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||
dev.LinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkGenMax != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||
dev.MaxLinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkWidthCur != nil {
|
||||
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||
}
|
||||
if info.PCIeLinkWidthMax != nil {
|
||||
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||
}
|
||||
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||
}
|
||||
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCIeBDF(t *testing.T) {
|
||||
|
||||
@@ -59,6 +59,7 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
"host bridge",
|
||||
"isa bridge",
|
||||
"pci bridge",
|
||||
"co-processor",
|
||||
"performance counter",
|
||||
"performance counters",
|
||||
"ram memory",
|
||||
|
||||
@@ -19,6 +19,7 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
{name: "audio", class: "Audio device", want: false},
|
||||
{name: "host bridge", class: "Host bridge", want: false},
|
||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||
{name: "co-processor", class: "Co-processor", want: false},
|
||||
{name: "smbus", class: "SMBus", want: false},
|
||||
{name: "perf", class: "Performance counters", want: false},
|
||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||
@@ -76,6 +77,20 @@ func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersCoProcessors(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "H100" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
|
||||
@@ -77,11 +77,28 @@ func discoverStorageDevices() []lsblkDevice {
|
||||
if dev.Type != "disk" {
|
||||
continue
|
||||
}
|
||||
if isVirtualBMCDisk(dev) {
|
||||
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||
continue
|
||||
}
|
||||
disks = append(disks, dev)
|
||||
}
|
||||
return disks
|
||||
}
|
||||
|
||||
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||
// These have zero reported size, a generic fake serial, and a model name that
|
||||
// starts with "Virtual HDisk".
|
||||
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||
return isVirtualHDiskModel(dev.Model)
|
||||
}
|
||||
|
||||
func isVirtualHDiskModel(model string) bool {
|
||||
model = strings.ToLower(strings.TrimSpace(model))
|
||||
return strings.HasPrefix(model, "virtual hdisk")
|
||||
}
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
|
||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package platform
|
||||
|
||||
import "regexp"
|
||||
|
||||
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
|
||||
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
|
||||
type ErrorPattern struct {
|
||||
// Name is a short machine-readable label for logging and deduplication.
|
||||
Name string
|
||||
// Re is the compiled regular expression matched against a single kmsg line.
|
||||
Re *regexp.Regexp
|
||||
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
|
||||
Category string
|
||||
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
|
||||
Severity string
|
||||
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
|
||||
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
|
||||
BDFGroup int
|
||||
// DevGroup is the capture group index (1-based) that contains a device name
|
||||
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
|
||||
DevGroup int
|
||||
}
|
||||
|
||||
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
||||
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
||||
var HardwareErrorPatterns = []ErrorPattern{
|
||||
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "nvidia-rminitadapter",
|
||||
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-msi-fail",
|
||||
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-xid",
|
||||
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-uncorrectable",
|
||||
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-link-down",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── Storage ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "blk-io-error",
|
||||
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvme-timeout",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "scsi-failed",
|
||||
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvme-reset",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
|
||||
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
||||
{
|
||||
Name: "mce-hardware-error",
|
||||
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "mce-corrected",
|
||||
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
|
||||
// ── Memory ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "edac-ue",
|
||||
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "edac-ce",
|
||||
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
}
|
||||
|
||||
func mustPat(s string) *regexp.Regexp {
|
||||
return regexp.MustCompile(s)
|
||||
}
|
||||
@@ -11,10 +11,10 @@ import (
|
||||
|
||||
// InstallDisk describes a candidate disk for installation.
|
||||
type InstallDisk struct {
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
@@ -117,6 +117,61 @@ func findLiveBootDevice() string {
|
||||
return "/dev/" + strings.TrimSpace(string(out2))
|
||||
}
|
||||
|
||||
func mountSource(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func mountFSType(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceType(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceTransport(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
||||
switch {
|
||||
case strings.EqualFold(strings.TrimSpace(fsType), "tmpfs"):
|
||||
return "ram"
|
||||
case strings.EqualFold(strings.TrimSpace(deviceType), "rom"):
|
||||
return "cdrom"
|
||||
case strings.EqualFold(strings.TrimSpace(transport), "usb"):
|
||||
return "usb"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/sr"):
|
||||
return "cdrom"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/"):
|
||||
return "disk"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// MinInstallBytes returns the minimum recommended disk size for installation:
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
|
||||
@@ -12,11 +12,40 @@ import (
|
||||
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
if fsType == "" {
|
||||
return toramActive()
|
||||
}
|
||||
return strings.TrimSpace(string(out)) == "tmpfs"
|
||||
return strings.EqualFold(fsType, "tmpfs")
|
||||
}
|
||||
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
source := mountSource("/run/live/medium")
|
||||
device := findLiveBootDevice()
|
||||
status := LiveBootSource{
|
||||
InRAM: strings.EqualFold(fsType, "tmpfs"),
|
||||
Source: source,
|
||||
Device: device,
|
||||
}
|
||||
if fsType == "" && source == "" && device == "" {
|
||||
if toramActive() {
|
||||
status.InRAM = true
|
||||
status.Kind = "ram"
|
||||
status.Source = "tmpfs"
|
||||
return status
|
||||
}
|
||||
status.Kind = "unknown"
|
||||
return status
|
||||
}
|
||||
status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
|
||||
if status.Kind == "" {
|
||||
status.Kind = "unknown"
|
||||
}
|
||||
if status.InRAM && strings.TrimSpace(status.Source) == "" {
|
||||
status.Source = "tmpfs"
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
|
||||
28
audit/internal/platform/install_to_ram_test.go
Normal file
28
audit/internal/platform/install_to_ram_test.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
fsType string
|
||||
source string
|
||||
deviceType string
|
||||
transport string
|
||||
want string
|
||||
}{
|
||||
{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
|
||||
{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
|
||||
{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
|
||||
{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
|
||||
{name: "unknown", source: "overlay", want: "unknown"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||
if got != tc.want {
|
||||
t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
64
audit/internal/platform/kill_workers.go
Normal file
64
audit/internal/platform/kill_workers.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||
// bee test worker processes that should be killed by KillTestWorkers.
|
||||
var workerPatterns = []string{
|
||||
"bee-gpu-burn",
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
}
|
||||
|
||||
// KilledProcess describes a process that was sent SIGKILL.
|
||||
type KilledProcess struct {
|
||||
PID int `json:"pid"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||
// SIGKILL to each one found. It returns a list of killed processes.
|
||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||
func KillTestWorkers() []KilledProcess {
|
||||
entries, err := os.ReadDir("/proc")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var killed []KilledProcess
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
pid, err := strconv.Atoi(e.Name())
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// /proc/*/cmdline uses NUL bytes as argument separators.
|
||||
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
|
||||
exe := strings.TrimSpace(args[0])
|
||||
base := exe
|
||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||
base = exe[idx+1:]
|
||||
}
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return killed
|
||||
}
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -531,6 +532,13 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
}
|
||||
|
||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
c.Cancel = func() error {
|
||||
if c.Process != nil {
|
||||
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,13 @@ package platform
|
||||
|
||||
type System struct{}
|
||||
|
||||
type LiveBootSource struct {
|
||||
InRAM bool `json:"in_ram"`
|
||||
Kind string `json:"kind"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Device string `json:"device,omitempty"`
|
||||
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
|
||||
@@ -63,6 +63,10 @@ func streamJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
streamSubscribedJob(w, r, j)
|
||||
}
|
||||
|
||||
func streamSubscribedJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
||||
existing, ch := j.subscribe()
|
||||
for _, line := range existing {
|
||||
sseWrite(w, "", line)
|
||||
@@ -346,8 +350,10 @@ func (h *handler) handleAPINetworkStatus(w http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"interfaces": ifaces,
|
||||
"default_route": h.opts.App.DefaultRoute(),
|
||||
"interfaces": ifaces,
|
||||
"default_route": h.opts.App.DefaultRoute(),
|
||||
"pending_change": h.hasPendingNetworkChange(),
|
||||
"rollback_in": h.pendingNetworkRollbackIn(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -426,27 +432,6 @@ func (h *handler) handleAPIExportList(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, entries)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request) {
|
||||
if globalQueue.hasActiveTarget("support-bundle") {
|
||||
writeError(w, http.StatusConflict, "support bundle task is already pending or running")
|
||||
return
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("support-bundle"),
|
||||
Name: "Support Bundle",
|
||||
Target: "support-bundle",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{
|
||||
"status": "queued",
|
||||
"task_id": t.ID,
|
||||
"job_id": t.ID,
|
||||
"url": "/export/support.tar.gz",
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportUSBTargets(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
@@ -541,9 +526,9 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
inRAM := h.opts.App.IsLiveMediaInRAM()
|
||||
status := h.opts.App.LiveBootSource()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]bool{"in_ram": inRAM})
|
||||
_ = json.NewEncoder(w).Encode(status)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -744,13 +729,7 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
||||
h.ringMemLoad.push(sample.MemLoadPct)
|
||||
|
||||
h.ringsMu.Lock()
|
||||
for i, fan := range sample.Fans {
|
||||
for len(h.ringFans) <= i {
|
||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||
h.fanNames = append(h.fanNames, fan.Name)
|
||||
}
|
||||
h.ringFans[i].push(float64(fan.RPM))
|
||||
}
|
||||
h.pushFanRings(sample.Fans)
|
||||
for _, gpu := range sample.GPUs {
|
||||
idx := gpu.GPUIndex
|
||||
for len(h.gpuRings) <= idx {
|
||||
@@ -769,6 +748,51 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
||||
h.ringsMu.Unlock()
|
||||
}
|
||||
|
||||
func (h *handler) pushFanRings(fans []platform.FanReading) {
|
||||
if len(fans) == 0 && len(h.ringFans) == 0 {
|
||||
return
|
||||
}
|
||||
fanValues := make(map[string]float64, len(fans))
|
||||
for _, fan := range fans {
|
||||
if fan.Name == "" {
|
||||
continue
|
||||
}
|
||||
fanValues[fan.Name] = fan.RPM
|
||||
found := false
|
||||
for i, name := range h.fanNames {
|
||||
if name == fan.Name {
|
||||
found = true
|
||||
if i >= len(h.ringFans) {
|
||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
h.fanNames = append(h.fanNames, fan.Name)
|
||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||
}
|
||||
}
|
||||
for i, ring := range h.ringFans {
|
||||
if ring == nil {
|
||||
continue
|
||||
}
|
||||
name := ""
|
||||
if i < len(h.fanNames) {
|
||||
name = h.fanNames[i]
|
||||
}
|
||||
if rpm, ok := fanValues[name]; ok {
|
||||
ring.push(rpm)
|
||||
continue
|
||||
}
|
||||
if last, ok := ring.latest(); ok {
|
||||
ring.push(last)
|
||||
continue
|
||||
}
|
||||
ring.push(0)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
|
||||
if name == "" {
|
||||
return
|
||||
@@ -847,7 +871,10 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
|
||||
return result, err
|
||||
}
|
||||
|
||||
pnc := &pendingNetChange{snapshot: snapshot}
|
||||
pnc := &pendingNetChange{
|
||||
snapshot: snapshot,
|
||||
deadline: time.Now().Add(netRollbackTimeout),
|
||||
}
|
||||
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
||||
_ = h.opts.App.RestoreNetworkSnapshot(snapshot)
|
||||
h.pendingNetMu.Lock()
|
||||
@@ -864,6 +891,25 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (h *handler) hasPendingNetworkChange() bool {
|
||||
h.pendingNetMu.Lock()
|
||||
defer h.pendingNetMu.Unlock()
|
||||
return h.pendingNet != nil
|
||||
}
|
||||
|
||||
func (h *handler) pendingNetworkRollbackIn() int {
|
||||
h.pendingNetMu.Lock()
|
||||
defer h.pendingNetMu.Unlock()
|
||||
if h.pendingNet == nil {
|
||||
return 0
|
||||
}
|
||||
remaining := int(time.Until(h.pendingNet.deadline).Seconds())
|
||||
if remaining < 1 {
|
||||
return 1
|
||||
}
|
||||
return remaining
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||
@@ -64,39 +64,29 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIExportBundleQueuesTask(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
h := &handler{}
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_A", RPM: 4200},
|
||||
{Name: "FAN_B", RPM: 5100},
|
||||
})
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_B", RPM: 5200},
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||
req := httptest.NewRequest("POST", "/api/export/bundle", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIExportBundle(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||
t.Fatalf("fanNames=%v", h.fanNames)
|
||||
}
|
||||
var body map[string]string
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
aVals, _ := h.ringFans[0].snapshot()
|
||||
bVals, _ := h.ringFans[1].snapshot()
|
||||
if len(aVals) != 2 || len(bVals) != 2 {
|
||||
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||
}
|
||||
if body["task_id"] == "" {
|
||||
t.Fatalf("missing task_id in response: %v", body)
|
||||
if aVals[1] != 4200 {
|
||||
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].Target; got != "support-bundle" {
|
||||
t.Fatalf("target=%q want support-bundle", got)
|
||||
if bVals[1] != 5200 {
|
||||
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||
}
|
||||
}
|
||||
|
||||
230
audit/internal/webui/kmsg_watcher.go
Normal file
230
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,230 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||
// During an active SAT task window it records matching lines; on task finish
|
||||
// it writes Warning status records to the component status DB.
|
||||
type kmsgWatcher struct {
|
||||
mu sync.Mutex
|
||||
activeWindow *kmsgWindow
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
type kmsgWindow struct {
|
||||
taskID string
|
||||
target string
|
||||
startedAt time.Time
|
||||
seen map[kmsgEventKey]bool
|
||||
events []kmsgEvent
|
||||
}
|
||||
|
||||
type kmsgEventKey struct {
|
||||
id string // BDF or device name
|
||||
category string
|
||||
}
|
||||
|
||||
type kmsgEvent struct {
|
||||
timestamp time.Time
|
||||
raw string
|
||||
ids []string // BDF addresses or device names extracted
|
||||
category string
|
||||
}
|
||||
|
||||
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||
return &kmsgWatcher{statusDB: statusDB}
|
||||
}
|
||||
|
||||
// start launches the background kmsg reading goroutine.
|
||||
func (w *kmsgWatcher) start() {
|
||||
go w.run()
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) run() {
|
||||
f, err := os.Open("/dev/kmsg")
|
||||
if err != nil {
|
||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Best-effort seek to end so we only capture events from now forward.
|
||||
_, _ = f.Seek(0, io.SeekEnd)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
evt, ok := parseKmsgLine(line)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
w.mu.Lock()
|
||||
if w.activeWindow != nil {
|
||||
w.recordEvent(evt)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
slog.Warn("kmsg watcher stopped", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||
// Must be called with w.mu held.
|
||||
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||
if len(evt.ids) == 0 {
|
||||
// Events without a device ID (e.g. MCE) — deduplicate by category.
|
||||
key := kmsgEventKey{id: "", category: evt.category}
|
||||
if !w.activeWindow.seen[key] {
|
||||
w.activeWindow.seen[key] = true
|
||||
w.activeWindow.events = append(w.activeWindow.events, evt)
|
||||
}
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
key := kmsgEventKey{id: id, category: evt.category}
|
||||
if !w.activeWindow.seen[key] {
|
||||
w.activeWindow.seen[key] = true
|
||||
w.activeWindow.events = append(w.activeWindow.events, evt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskStarted opens a new event window for the given SAT task.
|
||||
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.activeWindow = &kmsgWindow{
|
||||
taskID: taskID,
|
||||
target: target,
|
||||
startedAt: time.Now(),
|
||||
seen: make(map[kmsgEventKey]bool),
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskFinished closes the event window and asynchronously writes status records.
|
||||
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||
w.mu.Lock()
|
||||
window := w.activeWindow
|
||||
if window != nil && window.taskID == taskID {
|
||||
w.activeWindow = nil
|
||||
}
|
||||
w.mu.Unlock()
|
||||
|
||||
if window == nil || len(window.events) == 0 {
|
||||
return
|
||||
}
|
||||
go w.flushWindow(window)
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
source := "watchdog:kmsg"
|
||||
// Collect unique component keys from events.
|
||||
seen := map[string]string{} // componentKey → first raw line
|
||||
for _, evt := range window.events {
|
||||
if len(evt.ids) == 0 {
|
||||
// MCE or un-identified error.
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
}
|
||||
}
|
||||
for key, detail := range seen {
|
||||
detail = "kernel error during " + window.target + " SAT: " + truncate(detail, 120)
|
||||
w.statusDB.Record(key, source, "Warning", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||
msg := raw
|
||||
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||
msg = strings.TrimSpace(raw[idx+1:])
|
||||
}
|
||||
if msg == "" {
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
m := p.Re.FindStringSubmatch(msg)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
evt := kmsgEvent{
|
||||
timestamp: time.Now(),
|
||||
raw: msg,
|
||||
category: p.Category,
|
||||
}
|
||||
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||
}
|
||||
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||
}
|
||||
return evt, true
|
||||
}
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
|
||||
func normalizeBDF(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
return "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func truncate(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max] + "..."
|
||||
}
|
||||
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||
func isSATTarget(target string) bool {
|
||||
switch target {
|
||||
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||
"platform-stress":
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
@@ -120,7 +121,7 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
@@ -151,11 +152,6 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
if len(sysRows) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
// Reverse to chronological order
|
||||
for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
|
||||
sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
|
||||
}
|
||||
|
||||
// Collect min/max ts for range query
|
||||
minTS := sysRows[0].ts
|
||||
maxTS := sysRows[len(sysRows)-1].ts
|
||||
@@ -222,7 +218,9 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
}
|
||||
}
|
||||
|
||||
// Collect unique GPU indices and fan names from loaded data (preserve order)
|
||||
// Collect unique GPU indices and fan/temp names from loaded data.
|
||||
// Sort each list so that sample reconstruction is deterministic regardless
|
||||
// of Go's non-deterministic map iteration order.
|
||||
seenGPU := map[int]bool{}
|
||||
var gpuIndices []int
|
||||
for k := range gpuData {
|
||||
@@ -231,6 +229,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
gpuIndices = append(gpuIndices, k.idx)
|
||||
}
|
||||
}
|
||||
sort.Ints(gpuIndices)
|
||||
|
||||
seenFan := map[string]bool{}
|
||||
var fanNames []string
|
||||
for k := range fanData {
|
||||
@@ -239,6 +239,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
fanNames = append(fanNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(fanNames)
|
||||
|
||||
seenTemp := map[string]bool{}
|
||||
var tempNames []string
|
||||
for k := range tempData {
|
||||
@@ -247,6 +249,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
tempNames = append(tempNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(tempNames)
|
||||
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
|
||||
69
audit/internal/webui/metricsdb_test.go
Normal file
69
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 3; i++ {
|
||||
err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||
CPULoadPct: float64(10 + i),
|
||||
MemLoadPct: float64(20 + i),
|
||||
PowerW: float64(300 + i),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
all, err := db.LoadAll()
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAll: %v", err)
|
||||
}
|
||||
if len(all) != 3 {
|
||||
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||
}
|
||||
for i, sample := range all {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||
}
|
||||
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||
}
|
||||
}
|
||||
|
||||
recent, err := db.LoadRecent(2)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadRecent: %v", err)
|
||||
}
|
||||
if len(recent) != 2 {
|
||||
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||
}
|
||||
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||
}
|
||||
for i, sample := range recent {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -522,13 +522,30 @@ func renderMetrics() string {
|
||||
</div>
|
||||
|
||||
<script>
|
||||
const chartIds = [
|
||||
'chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
|
||||
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'
|
||||
];
|
||||
|
||||
function refreshChartImage(el) {
|
||||
if (!el || el.dataset.loading === '1') return;
|
||||
const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
|
||||
const nextSrc = baseSrc + '?t=' + Date.now();
|
||||
const probe = new Image();
|
||||
el.dataset.baseSrc = baseSrc;
|
||||
el.dataset.loading = '1';
|
||||
probe.onload = function() {
|
||||
el.src = nextSrc;
|
||||
el.dataset.loading = '0';
|
||||
};
|
||||
probe.onerror = function() {
|
||||
el.dataset.loading = '0';
|
||||
};
|
||||
probe.src = nextSrc;
|
||||
}
|
||||
|
||||
function refreshCharts() {
|
||||
const t = '?t=' + Date.now();
|
||||
['chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
|
||||
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'].forEach(id => {
|
||||
const el = document.getElementById(id);
|
||||
if (el) el.src = el.src.split('?')[0] + t;
|
||||
});
|
||||
chartIds.forEach(id => refreshChartImage(document.getElementById(id)));
|
||||
}
|
||||
setInterval(refreshCharts, 3000);
|
||||
|
||||
@@ -892,6 +909,8 @@ func renderNetworkInline() string {
|
||||
</div>
|
||||
<script>
|
||||
var _netCountdownTimer = null;
|
||||
var _netRefreshTimer = null;
|
||||
const NET_ROLLBACK_SECS = 60;
|
||||
function loadNetwork() {
|
||||
fetch('/api/network').then(r=>r.json()).then(d => {
|
||||
const rows = (d.interfaces||[]).map(i =>
|
||||
@@ -902,21 +921,33 @@ function loadNetwork() {
|
||||
document.getElementById('iface-table').innerHTML =
|
||||
'<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
|
||||
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
||||
});
|
||||
if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||
else hideNetPending();
|
||||
}).catch(function() {});
|
||||
}
|
||||
function selectIface(iface) {
|
||||
document.getElementById('dhcp-iface').value = iface;
|
||||
document.getElementById('st-iface').value = iface;
|
||||
}
|
||||
function toggleIface(iface, currentState) {
|
||||
showNetPending(NET_ROLLBACK_SECS);
|
||||
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
||||
.then(r=>r.json()).then(d => {
|
||||
if (d.error) { alert('Error: '+d.error); return; }
|
||||
if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
|
||||
loadNetwork();
|
||||
showNetPending(d.rollback_in || 60);
|
||||
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||
}).catch(function() {
|
||||
setTimeout(loadNetwork, 1500);
|
||||
});
|
||||
}
|
||||
function hideNetPending() {
|
||||
const el = document.getElementById('net-pending');
|
||||
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||
_netCountdownTimer = null;
|
||||
el.style.display = 'none';
|
||||
}
|
||||
function showNetPending(secs) {
|
||||
if (!secs || secs < 1) { hideNetPending(); return; }
|
||||
const el = document.getElementById('net-pending');
|
||||
el.style.display = 'block';
|
||||
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||
@@ -925,30 +956,33 @@ function showNetPending(secs) {
|
||||
_netCountdownTimer = setInterval(function() {
|
||||
remaining--;
|
||||
document.getElementById('net-countdown').textContent = remaining;
|
||||
if (remaining <= 0) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; el.style.display='none'; loadNetwork(); }
|
||||
if (remaining <= 0) { hideNetPending(); loadNetwork(); }
|
||||
}, 1000);
|
||||
}
|
||||
function confirmNetChange() {
|
||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
||||
document.getElementById('net-pending').style.display='none';
|
||||
fetch('/api/network/confirm',{method:'POST'});
|
||||
hideNetPending();
|
||||
fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||
}
|
||||
function rollbackNetChange() {
|
||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
||||
document.getElementById('net-pending').style.display='none';
|
||||
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork());
|
||||
hideNetPending();
|
||||
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||
}
|
||||
function runDHCP() {
|
||||
const iface = document.getElementById('dhcp-iface').value.trim();
|
||||
showNetPending(NET_ROLLBACK_SECS);
|
||||
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
||||
.then(r=>r.json()).then(d => {
|
||||
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
|
||||
if (!d.error) showNetPending(d.rollback_in || 60);
|
||||
if (d.error) { hideNetPending(); return; }
|
||||
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||
loadNetwork();
|
||||
}).catch(function() {
|
||||
setTimeout(loadNetwork, 1500);
|
||||
});
|
||||
}
|
||||
function setStatic() {
|
||||
const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
|
||||
showNetPending(NET_ROLLBACK_SECS);
|
||||
fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
|
||||
interface: document.getElementById('st-iface').value,
|
||||
address: document.getElementById('st-addr').value,
|
||||
@@ -957,11 +991,16 @@ function setStatic() {
|
||||
dns: dns,
|
||||
})}).then(r=>r.json()).then(d => {
|
||||
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
|
||||
if (!d.error) showNetPending(d.rollback_in || 60);
|
||||
if (d.error) { hideNetPending(); return; }
|
||||
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||
loadNetwork();
|
||||
}).catch(function() {
|
||||
setTimeout(loadNetwork, 1500);
|
||||
});
|
||||
}
|
||||
loadNetwork();
|
||||
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
|
||||
_netRefreshTimer = setInterval(loadNetwork, 5000);
|
||||
</script>`
|
||||
}
|
||||
|
||||
@@ -974,7 +1013,7 @@ func renderNetwork() string {
|
||||
// ── Services ──────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderServicesInline() string {
|
||||
return `<div style="display:flex;justify-content:flex-end;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
return `<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div id="svc-out" style="display:none;margin-top:8px" class="card">
|
||||
<div class="card-head">Output</div>
|
||||
@@ -1015,6 +1054,9 @@ function svcAction(name, action) {
|
||||
setTimeout(loadServices, 1000);
|
||||
});
|
||||
}
|
||||
function restartGPUDrivers() {
|
||||
svcAction('bee-nvidia', 'restart');
|
||||
}
|
||||
loadServices();
|
||||
</script>`
|
||||
}
|
||||
@@ -1139,73 +1181,46 @@ func listExportFiles(exportDir string) ([]string, error) {
|
||||
}
|
||||
|
||||
func renderSupportBundleInline() string {
|
||||
return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleBuild()">Build Support Bundle</button>
|
||||
<a id="support-bundle-download" class="btn btn-secondary" href="/export/support.tar.gz" style="display:none">↓ Download Support Bundle</a>
|
||||
<div id="support-bundle-status" style="margin-top:12px;font-size:13px;color:var(--muted)">No support bundle built in this session.</div>
|
||||
<div id="support-bundle-log" class="terminal" style="display:none;margin-top:12px;max-height:260px"></div>
|
||||
return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
|
||||
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
|
||||
<script>
|
||||
(function(){
|
||||
var _supportBundleES = null;
|
||||
window.supportBundleBuild = function() {
|
||||
window.supportBundleDownload = function() {
|
||||
var btn = document.getElementById('support-bundle-btn');
|
||||
var status = document.getElementById('support-bundle-status');
|
||||
var log = document.getElementById('support-bundle-log');
|
||||
var download = document.getElementById('support-bundle-download');
|
||||
if (_supportBundleES) {
|
||||
_supportBundleES.close();
|
||||
_supportBundleES = null;
|
||||
}
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Building...';
|
||||
status.textContent = 'Queueing support bundle task...';
|
||||
status.textContent = 'Collecting logs and export data\u2026';
|
||||
status.style.color = 'var(--muted)';
|
||||
log.style.display = '';
|
||||
log.textContent = '';
|
||||
download.style.display = 'none';
|
||||
|
||||
fetch('/api/export/bundle', {method:'POST'}).then(function(r){
|
||||
return r.json().then(function(j){
|
||||
if (!r.ok) throw new Error(j.error || r.statusText);
|
||||
return j;
|
||||
});
|
||||
}).then(function(data){
|
||||
if (!data.task_id) throw new Error('missing task id');
|
||||
status.textContent = 'Building support bundle...';
|
||||
_supportBundleES = new EventSource('/api/tasks/' + data.task_id + '/stream');
|
||||
_supportBundleES.onmessage = function(e) {
|
||||
log.textContent += e.data + '\n';
|
||||
log.scrollTop = log.scrollHeight;
|
||||
};
|
||||
_supportBundleES.addEventListener('done', function(e) {
|
||||
_supportBundleES.close();
|
||||
_supportBundleES = null;
|
||||
btn.disabled = false;
|
||||
btn.textContent = 'Build Support Bundle';
|
||||
if (e.data) {
|
||||
status.textContent = 'Error: ' + e.data;
|
||||
status.style.color = 'var(--crit-fg)';
|
||||
return;
|
||||
}
|
||||
status.textContent = 'Support bundle ready.';
|
||||
var filename = 'bee-support.tar.gz';
|
||||
fetch('/export/support.tar.gz')
|
||||
.then(function(r) {
|
||||
if (!r.ok) throw new Error('HTTP ' + r.status);
|
||||
var cd = r.headers.get('Content-Disposition') || '';
|
||||
var m = cd.match(/filename="?([^";]+)"?/);
|
||||
if (m) filename = m[1];
|
||||
return r.blob();
|
||||
})
|
||||
.then(function(blob) {
|
||||
var url = URL.createObjectURL(blob);
|
||||
var a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = filename;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
status.textContent = 'Download started.';
|
||||
status.style.color = 'var(--ok-fg)';
|
||||
download.style.display = '';
|
||||
});
|
||||
_supportBundleES.onerror = function() {
|
||||
if (_supportBundleES) _supportBundleES.close();
|
||||
_supportBundleES = null;
|
||||
btn.disabled = false;
|
||||
btn.textContent = 'Build Support Bundle';
|
||||
status.textContent = 'Support bundle stream disconnected.';
|
||||
})
|
||||
.catch(function(e) {
|
||||
status.textContent = 'Error: ' + e.message;
|
||||
status.style.color = 'var(--crit-fg)';
|
||||
};
|
||||
}).catch(function(e){
|
||||
btn.disabled = false;
|
||||
btn.textContent = 'Build Support Bundle';
|
||||
status.textContent = 'Error: ' + e;
|
||||
status.style.color = 'var(--crit-fg)';
|
||||
});
|
||||
})
|
||||
.finally(function() {
|
||||
btn.disabled = false;
|
||||
btn.textContent = '\u2195 Download Support Bundle';
|
||||
});
|
||||
};
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
@@ -1267,6 +1282,7 @@ func renderTools() string {
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
@@ -1278,8 +1294,18 @@ func renderTools() string {
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let kind = d.kind || 'unknown';
|
||||
let label = source;
|
||||
if (kind === 'ram') label = 'RAM';
|
||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
if (d.in_ram) {
|
||||
txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.';
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
@@ -1538,31 +1564,47 @@ func renderInstall() string {
|
||||
// ── Tasks ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderTasks() string {
|
||||
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
||||
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
||||
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
|
||||
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">Tasks run one at a time. Logs persist after navigation.</span>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
|
||||
</div>
|
||||
<div id="task-log-section" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Logs — <span id="task-log-title"></span>
|
||||
<button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">✕</button>
|
||||
<div id="task-log-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.58);z-index:120;align-items:center;justify-content:center;padding:16px">
|
||||
<div style="background:#fff;border-radius:6px;box-shadow:0 24px 60px rgba(0,0,0,.35);width:calc(100vw - 32px);max-width:1600px;height:calc(100vh - 32px);display:flex;flex-direction:column;overflow:hidden;position:relative">
|
||||
<div class="card-head" style="padding:14px 18px;font-size:14px">Logs — <span id="task-log-title"></span>
|
||||
<button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">✕</button>
|
||||
</div>
|
||||
<div class="card-body" style="padding:16px;flex:1;min-height:0"><div id="task-log-terminal" class="terminal" style="height:100%;max-height:none"></div></div>
|
||||
</div>
|
||||
<div class="card-body"><div id="task-log-terminal" class="terminal" style="max-height:500px"></div></div>
|
||||
</div>
|
||||
<script>
|
||||
var _taskLogES = null;
|
||||
var _taskRefreshTimer = null;
|
||||
var _tasksAll = [];
|
||||
var _taskPage = 1;
|
||||
var _taskPageSize = 50;
|
||||
var _taskLogID = '';
|
||||
|
||||
function loadTasks() {
|
||||
fetch('/api/tasks').then(r=>r.json()).then(tasks => {
|
||||
if (!tasks || tasks.length === 0) {
|
||||
_tasksAll = Array.isArray(tasks) ? tasks : [];
|
||||
if (_tasksAll.length === 0) {
|
||||
_taskPage = 1;
|
||||
document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
|
||||
syncTaskLogFromHash();
|
||||
return;
|
||||
}
|
||||
const rows = tasks.map(t => {
|
||||
const dur = t.started_at ? formatDur(t.started_at, t.done_at) : '';
|
||||
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||
if (_taskPage > totalPages) _taskPage = totalPages;
|
||||
if (_taskPage < 1) _taskPage = 1;
|
||||
const start = (_taskPage - 1) * _taskPageSize;
|
||||
const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
|
||||
const rows = pageTasks.map(t => {
|
||||
const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
|
||||
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
|
||||
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
|
||||
let actions = '<button class="btn btn-sm btn-secondary" onclick="viewLog(\''+t.id+'\',\''+escHtml(t.name)+'\')">Logs</button>';
|
||||
@@ -1580,21 +1622,35 @@ function loadTasks() {
|
||||
'<td>'+t.priority+'</td>' +
|
||||
'<td>'+actions+'</td></tr>';
|
||||
}).join('');
|
||||
const showingFrom = start + 1;
|
||||
const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
|
||||
const pager =
|
||||
'<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
|
||||
'<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
|
||||
'<div style="display:flex;align-items:center;gap:8px">' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
|
||||
'<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
|
||||
'</div>' +
|
||||
'</div>';
|
||||
document.getElementById('tasks-table').innerHTML =
|
||||
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>';
|
||||
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
|
||||
syncTaskLogFromHash();
|
||||
});
|
||||
}
|
||||
|
||||
function escHtml(s) { return (s||'').replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"'); }
|
||||
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
|
||||
function formatDur(start, end) {
|
||||
try {
|
||||
const s = new Date(start), e = end ? new Date(end) : new Date();
|
||||
const sec = Math.round((e-s)/1000);
|
||||
if (sec < 60) return sec+'s';
|
||||
const m = Math.floor(sec/60), ss = sec%60;
|
||||
return m+'m '+ss+'s';
|
||||
} catch(e){ return ''; }
|
||||
function formatDurSec(sec) {
|
||||
sec = Math.max(0, Math.round(sec||0));
|
||||
if (sec < 60) return sec+'s';
|
||||
const m = Math.floor(sec/60), ss = sec%60;
|
||||
return m+'m '+ss+'s';
|
||||
}
|
||||
function setTaskPage(page) {
|
||||
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||
_taskPage = Math.min(totalPages, Math.max(1, page));
|
||||
loadTasks();
|
||||
}
|
||||
|
||||
function cancelTask(id) {
|
||||
@@ -1603,28 +1659,78 @@ function cancelTask(id) {
|
||||
function cancelAll() {
|
||||
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
||||
}
|
||||
function killWorkers() {
|
||||
if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
|
||||
fetch('/api/tasks/kill-workers',{method:'POST'})
|
||||
.then(r=>r.json())
|
||||
.then(d=>{
|
||||
loadTasks();
|
||||
var toast = document.getElementById('kill-toast');
|
||||
var parts = [];
|
||||
if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
|
||||
if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
|
||||
toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
|
||||
toast.style.display = '';
|
||||
setTimeout(()=>{ toast.style.display='none'; }, 5000);
|
||||
});
|
||||
}
|
||||
function setPriority(id, delta) {
|
||||
fetch('/api/tasks/'+id+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
|
||||
.then(()=>loadTasks());
|
||||
}
|
||||
function resetTaskLog(term, text) {
|
||||
term.textContent = text ? text + '\n' : '';
|
||||
if (text) term.dataset.placeholder = '1';
|
||||
else delete term.dataset.placeholder;
|
||||
}
|
||||
function prependTaskLogLine(term, line) {
|
||||
if (term.dataset.placeholder === '1') {
|
||||
term.textContent = '';
|
||||
delete term.dataset.placeholder;
|
||||
}
|
||||
term.prepend(document.createTextNode(line + '\n'));
|
||||
term.scrollTop = 0;
|
||||
}
|
||||
function viewLog(id, name) {
|
||||
if (_taskLogES) { _taskLogES.close(); _taskLogES = null; }
|
||||
document.getElementById('task-log-section').style.display = '';
|
||||
_taskLogID = id;
|
||||
window.location.hash = id;
|
||||
document.getElementById('task-log-overlay').style.display = 'flex';
|
||||
document.getElementById('task-log-title').textContent = name;
|
||||
const term = document.getElementById('task-log-terminal');
|
||||
term.textContent = 'Connecting...\n';
|
||||
resetTaskLog(term, 'Connecting...');
|
||||
_taskLogES = new EventSource('/api/tasks/'+id+'/stream');
|
||||
_taskLogES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
_taskLogES.onopen = () => {
|
||||
if (term.dataset.placeholder === '1') resetTaskLog(term, 'Connected. Waiting for output...');
|
||||
};
|
||||
_taskLogES.onmessage = e => { prependTaskLogLine(term, e.data); };
|
||||
_taskLogES.addEventListener('done', e => {
|
||||
_taskLogES.close(); _taskLogES=null;
|
||||
term.textContent += (e.data ? '\nERROR: '+e.data : '\nDone.')+'\n';
|
||||
prependTaskLogLine(term, e.data ? 'ERROR: '+e.data : 'Done.');
|
||||
});
|
||||
}
|
||||
function syncTaskLogFromHash() {
|
||||
const id = (window.location.hash || '').replace(/^#/, '');
|
||||
if (!id || id === _taskLogID) return;
|
||||
const task = _tasksAll.find(t => t.id === id);
|
||||
if (!task) return;
|
||||
viewLog(task.id, task.name || task.id);
|
||||
}
|
||||
function closeTaskLog() {
|
||||
if (_taskLogES) { _taskLogES.close(); _taskLogES=null; }
|
||||
document.getElementById('task-log-section').style.display='none';
|
||||
_taskLogID = '';
|
||||
if (window.location.hash) history.replaceState(null, '', '/tasks');
|
||||
document.getElementById('task-log-overlay').style.display='none';
|
||||
}
|
||||
|
||||
document.getElementById('task-log-overlay').addEventListener('click', function(e) {
|
||||
if (e.target === this) closeTaskLog();
|
||||
});
|
||||
window.addEventListener('hashchange', syncTaskLogFromHash);
|
||||
window.addEventListener('keydown', function(e) {
|
||||
if (e.key === 'Escape' && document.getElementById('task-log-overlay').style.display !== 'none') closeTaskLog();
|
||||
});
|
||||
|
||||
loadTasks();
|
||||
_taskRefreshTimer = setInterval(loadTasks, 2000);
|
||||
</script>`
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -84,6 +85,15 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
|
||||
return v, labels
|
||||
}
|
||||
|
||||
func (r *metricsRing) latest() (float64, bool) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
if len(r.vals) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return r.vals[len(r.vals)-1], true
|
||||
}
|
||||
|
||||
func timestampsSameLocalDay(times []time.Time) bool {
|
||||
if len(times) == 0 {
|
||||
return true
|
||||
@@ -118,9 +128,16 @@ type namedMetricsRing struct {
|
||||
Ring *metricsRing
|
||||
}
|
||||
|
||||
// metricsChartWindow is the number of samples kept in the live ring buffer.
|
||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||
const metricsChartWindow = 360
|
||||
|
||||
var metricsCollectInterval = 5 * time.Second
|
||||
|
||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||
type pendingNetChange struct {
|
||||
snapshot platform.NetworkSnapshot
|
||||
deadline time.Time
|
||||
timer *time.Timer
|
||||
mu sync.Mutex
|
||||
}
|
||||
@@ -147,6 +164,8 @@ type handler struct {
|
||||
// pending network change (rollback on timeout)
|
||||
pendingNet *pendingNetChange
|
||||
pendingNetMu sync.Mutex
|
||||
// kmsg hardware error watcher
|
||||
kmsg *kmsgWatcher
|
||||
}
|
||||
|
||||
// NewHandler creates the HTTP mux with all routes.
|
||||
@@ -171,7 +190,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Open metrics DB and pre-fill ring buffers from history.
|
||||
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
||||
h.metricsDB = db
|
||||
if samples, err := db.LoadRecent(120); err == nil {
|
||||
if samples, err := db.LoadRecent(metricsChartWindow); err == nil {
|
||||
for _, s := range samples {
|
||||
h.feedRings(s)
|
||||
}
|
||||
@@ -186,6 +205,13 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
}
|
||||
h.startMetricsCollector()
|
||||
|
||||
// Start kmsg hardware error watcher if the app (and its status DB) is available.
|
||||
if opts.App != nil {
|
||||
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||
h.kmsg.start()
|
||||
globalQueue.kmsgWatcher = h.kmsg
|
||||
}
|
||||
|
||||
globalQueue.startWorker(&opts)
|
||||
mux := http.NewServeMux()
|
||||
|
||||
@@ -225,6 +251,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Tasks
|
||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||
mux.HandleFunc("POST /api/tasks/cancel-all", h.handleAPITasksCancelAll)
|
||||
mux.HandleFunc("POST /api/tasks/kill-workers", h.handleAPITasksKillWorkers)
|
||||
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
||||
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
||||
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
||||
@@ -243,7 +270,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
|
||||
// Export
|
||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||
mux.HandleFunc("POST /api/export/bundle", h.handleAPIExportBundle)
|
||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
||||
@@ -288,15 +314,15 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
|
||||
func (h *handler) startMetricsCollector() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(1 * time.Second)
|
||||
ticker := time.NewTicker(metricsCollectInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
sample := platform.SampleLiveMetrics()
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -368,15 +394,12 @@ func (h *handler) handleRuntimeHealthJSON(w http.ResponseWriter, r *http.Request
|
||||
}
|
||||
|
||||
func (h *handler) handleSupportBundleDownload(w http.ResponseWriter, r *http.Request) {
|
||||
archive, err := app.LatestSupportBundlePath()
|
||||
archive, err := app.BuildSupportBundle(h.opts.ExportDir)
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
http.Error(w, "support bundle not built yet", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
http.Error(w, fmt.Sprintf("locate support bundle: %v", err), http.StatusInternalServerError)
|
||||
http.Error(w, fmt.Sprintf("build support bundle: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
defer os.Remove(archive)
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "application/gzip")
|
||||
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%q", filepath.Base(archive)))
|
||||
@@ -448,222 +471,13 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
||||
path = strings.TrimSuffix(path, ".svg")
|
||||
|
||||
if h.metricsDB != nil {
|
||||
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
|
||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
return
|
||||
}
|
||||
if h.metricsDB == nil {
|
||||
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var labels []string
|
||||
var title string
|
||||
var yMin, yMax *float64 // nil = auto; for load charts fixed 0-100
|
||||
|
||||
switch {
|
||||
// ── Server sub-charts ─────────────────────────────────────────────────
|
||||
case path == "server-load":
|
||||
title = "CPU / Memory Load"
|
||||
vCPULoad, l := h.ringCPULoad.snapshot()
|
||||
vMemLoad, _ := h.ringMemLoad.snapshot()
|
||||
labels = l
|
||||
datasets = [][]float64{vCPULoad, vMemLoad}
|
||||
names = []string{"CPU Load %", "Mem Load %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "server-temp", path == "server-temp-cpu":
|
||||
title = "CPU Temperature"
|
||||
h.ringsMu.Lock()
|
||||
datasets, names, labels = snapshotNamedRings(h.cpuTempRings)
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-gpu":
|
||||
title = "GPU Temperature"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vTemp, l := gr.Temp.snapshot()
|
||||
datasets = append(datasets, vTemp)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-ambient":
|
||||
title = "Ambient / Other Sensors"
|
||||
h.ringsMu.Lock()
|
||||
datasets, names, labels = snapshotNamedRings(h.ambientTempRings)
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-power":
|
||||
title = "System Power"
|
||||
vPower, l := h.ringPower.snapshot()
|
||||
vPower = normalizePowerSeries(vPower)
|
||||
labels = l
|
||||
datasets = [][]float64{vPower}
|
||||
names = []string{"Power W"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(vPower)
|
||||
|
||||
case path == "server-fans":
|
||||
title = "Fan RPM"
|
||||
h.ringsMu.Lock()
|
||||
for i, fr := range h.ringFans {
|
||||
fv, _ := fr.snapshot()
|
||||
datasets = append(datasets, fv)
|
||||
name := "Fan"
|
||||
if i < len(h.fanNames) {
|
||||
name = h.fanNames[i]
|
||||
}
|
||||
names = append(names, name+" RPM")
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
// ── Combined GPU charts (all GPUs on one chart) ───────────────────────
|
||||
case path == "gpu-all-load":
|
||||
title = "GPU Compute Load"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vUtil, l := gr.Util.snapshot()
|
||||
datasets = append(datasets, vUtil)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-memload":
|
||||
title = "GPU Memory Load"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vMem, l := gr.MemUtil.snapshot()
|
||||
datasets = append(datasets, vMem)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-power":
|
||||
title = "GPU Power"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vPow, l := gr.Power.snapshot()
|
||||
datasets = append(datasets, vPow)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "gpu-all-temp":
|
||||
title = "GPU Temperature"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vTemp, l := gr.Temp.snapshot()
|
||||
datasets = append(datasets, vTemp)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
// ── Per-GPU sub-charts ────────────────────────────────────────────────
|
||||
case strings.HasPrefix(path, "gpu/"):
|
||||
rest := strings.TrimPrefix(path, "gpu/")
|
||||
// rest is either "{idx}-load", "{idx}-temp", "{idx}-power", or legacy "{idx}"
|
||||
sub := ""
|
||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||
sub = rest[i+1:]
|
||||
rest = rest[:i]
|
||||
}
|
||||
idx := 0
|
||||
fmt.Sscanf(rest, "%d", &idx)
|
||||
h.ringsMu.Lock()
|
||||
var gr *gpuRings
|
||||
if idx < len(h.gpuRings) {
|
||||
gr = h.gpuRings[idx]
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
if gr == nil {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
switch sub {
|
||||
case "load":
|
||||
vUtil, l := gr.Util.snapshot()
|
||||
vMemUtil, _ := gr.MemUtil.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d Load", idx)
|
||||
datasets = [][]float64{vUtil, vMemUtil}
|
||||
names = []string{"Load %", "Mem %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
case "temp":
|
||||
vTemp, l := gr.Temp.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
||||
datasets = [][]float64{vTemp}
|
||||
names = []string{"Temp °C"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(vTemp)
|
||||
default: // "power" or legacy (no sub)
|
||||
vPower, l := gr.Power.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d Power", idx)
|
||||
datasets = [][]float64{vPower}
|
||||
names = []string{"Power W"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(vPower)
|
||||
}
|
||||
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
|
||||
if !ok {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -840,6 +654,7 @@ func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]f
|
||||
}
|
||||
}
|
||||
}
|
||||
sort.Strings(names)
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
@@ -867,6 +682,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
|
||||
}
|
||||
}
|
||||
}
|
||||
sort.Strings(names)
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
@@ -878,7 +694,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
datasets = append(datasets, normalizeFanSeries(ds))
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
@@ -894,6 +710,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
|
||||
}
|
||||
}
|
||||
}
|
||||
sort.Ints(indices)
|
||||
datasets := make([][]float64, 0, len(indices))
|
||||
names := make([]string, 0, len(indices))
|
||||
for _, idx := range indices {
|
||||
@@ -953,6 +770,27 @@ func normalizePowerSeries(ds []float64) []float64 {
|
||||
return out
|
||||
}
|
||||
|
||||
func normalizeFanSeries(ds []float64) []float64 {
|
||||
if len(ds) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]float64, len(ds))
|
||||
var lastPositive float64
|
||||
for i, v := range ds {
|
||||
if v > 0 {
|
||||
lastPositive = v
|
||||
out[i] = v
|
||||
continue
|
||||
}
|
||||
if lastPositive > 0 {
|
||||
out[i] = lastPositive
|
||||
continue
|
||||
}
|
||||
out[i] = 0
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// floatPtr returns a pointer to a float64 value.
|
||||
func floatPtr(v float64) *float64 { return &v }
|
||||
|
||||
@@ -1044,15 +882,17 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
||||
opt.Title = gocharts.TitleOption{Text: title}
|
||||
opt.XAxis.Labels = sparse
|
||||
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
||||
if chartLegendVisible(len(names)) {
|
||||
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
|
||||
opt.Legend.OverlayChart = gocharts.Ptr(false)
|
||||
} else {
|
||||
opt.Legend.Show = gocharts.Ptr(false)
|
||||
}
|
||||
opt.Symbol = gocharts.SymbolNone
|
||||
// Right padding: reserve space for the MarkLine label (library recommendation).
|
||||
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
||||
if yMin != nil || yMax != nil {
|
||||
opt.YAxis = []gocharts.YAxisOption{{
|
||||
Min: yMin,
|
||||
Max: yMax,
|
||||
ValueFormatter: chartLegendNumber,
|
||||
}}
|
||||
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
|
||||
}
|
||||
|
||||
// Add a single peak mark line on the series that holds the global maximum.
|
||||
@@ -1064,7 +904,7 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
||||
p := gocharts.NewPainter(gocharts.PainterOptions{
|
||||
OutputFormat: gocharts.ChartOutputSVG,
|
||||
Width: 1400,
|
||||
Height: 240,
|
||||
Height: chartCanvasHeight(len(names)),
|
||||
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
|
||||
if err := p.LineChart(opt); err != nil {
|
||||
return nil, err
|
||||
@@ -1072,6 +912,26 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
||||
return p.Bytes()
|
||||
}
|
||||
|
||||
func chartLegendVisible(seriesCount int) bool {
|
||||
return seriesCount <= 8
|
||||
}
|
||||
|
||||
func chartCanvasHeight(seriesCount int) int {
|
||||
if chartLegendVisible(seriesCount) {
|
||||
return 360
|
||||
}
|
||||
return 288
|
||||
}
|
||||
|
||||
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
|
||||
return gocharts.YAxisOption{
|
||||
Min: yMin,
|
||||
Max: yMax,
|
||||
LabelCount: 11,
|
||||
ValueFormatter: chartYAxisNumber,
|
||||
}
|
||||
}
|
||||
|
||||
// globalPeakSeries returns the index of the series containing the global maximum
|
||||
// value across all datasets, and that maximum value.
|
||||
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
|
||||
@@ -1159,6 +1019,28 @@ func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []str
|
||||
return datasets, names, labels
|
||||
}
|
||||
|
||||
func snapshotFanRings(rings []*metricsRing, fanNames []string) ([][]float64, []string, []string) {
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var labels []string
|
||||
for i, ring := range rings {
|
||||
if ring == nil {
|
||||
continue
|
||||
}
|
||||
vals, l := ring.snapshot()
|
||||
datasets = append(datasets, normalizeFanSeries(vals))
|
||||
name := "Fan"
|
||||
if i < len(fanNames) {
|
||||
name = fanNames[i]
|
||||
}
|
||||
names = append(names, name+" RPM")
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
return datasets, names, labels
|
||||
}
|
||||
|
||||
func chartLegendNumber(v float64) string {
|
||||
neg := v < 0
|
||||
if v < 0 {
|
||||
@@ -1181,6 +1063,30 @@ func chartLegendNumber(v float64) string {
|
||||
return out
|
||||
}
|
||||
|
||||
func chartYAxisNumber(v float64) string {
|
||||
neg := v < 0
|
||||
if neg {
|
||||
v = -v
|
||||
}
|
||||
var out string
|
||||
switch {
|
||||
case v >= 10000:
|
||||
out = fmt.Sprintf("%dк", int((v+500)/1000))
|
||||
case v >= 1000:
|
||||
// Use one decimal place so ticks like 1400, 1600, 1800 read as
|
||||
// "1,4к", "1,6к", "1,8к" instead of the ambiguous "1к"/"2к".
|
||||
s := fmt.Sprintf("%.1f", v/1000)
|
||||
s = strings.TrimRight(strings.TrimRight(s, "0"), ".")
|
||||
out = strings.ReplaceAll(s, ".", ",") + "к"
|
||||
default:
|
||||
out = fmt.Sprintf("%.0f", v)
|
||||
}
|
||||
if neg {
|
||||
return "-" + out
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func sparseLabels(labels []string, n int) []string {
|
||||
out := make([]string, len(labels))
|
||||
step := len(labels) / n
|
||||
|
||||
@@ -89,6 +89,53 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 7, PowerW: 170},
|
||||
{GPUIndex: 2, PowerW: 120},
|
||||
{GPUIndex: 0, PowerW: 100},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: 101},
|
||||
{GPUIndex: 7, PowerW: 171},
|
||||
{GPUIndex: 2, PowerW: 121},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
if title != "GPU Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
|
||||
if len(names) != len(wantNames) {
|
||||
t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
|
||||
}
|
||||
for i := range wantNames {
|
||||
if names[i] != wantNames[i] {
|
||||
t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
|
||||
}
|
||||
}
|
||||
if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
|
||||
t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
|
||||
}
|
||||
if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
|
||||
t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
|
||||
}
|
||||
if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
|
||||
t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||
want := []float64{0, 480, 480, 480, 510, 510}
|
||||
@@ -102,6 +149,120 @@ func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
||||
body := renderMetrics()
|
||||
if !strings.Contains(body, "const probe = new Image();") {
|
||||
t.Fatalf("metrics page should preload chart images before swap: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartLegendVisible(t *testing.T) {
|
||||
if !chartLegendVisible(8) {
|
||||
t.Fatal("legend should stay visible for charts with up to 8 series")
|
||||
}
|
||||
if chartLegendVisible(9) {
|
||||
t.Fatal("legend should be hidden for charts with more than 8 series")
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartYAxisNumber(t *testing.T) {
|
||||
tests := []struct {
|
||||
in float64
|
||||
want string
|
||||
}{
|
||||
{in: 999, want: "999"},
|
||||
{in: 1000, want: "1к"},
|
||||
{in: 1370, want: "1,4к"},
|
||||
{in: 1500, want: "1,5к"},
|
||||
{in: 1700, want: "1,7к"},
|
||||
{in: 2000, want: "2к"},
|
||||
{in: 9999, want: "10к"},
|
||||
{in: 10200, want: "10к"},
|
||||
{in: -1500, want: "-1,5к"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := chartYAxisNumber(tc.in); got != tc.want {
|
||||
t.Fatalf("chartYAxisNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartCanvasHeight(t *testing.T) {
|
||||
if got := chartCanvasHeight(4); got != 360 {
|
||||
t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
|
||||
}
|
||||
if got := chartCanvasHeight(12); got != 288 {
|
||||
t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartYAxisOption(t *testing.T) {
|
||||
min := floatPtr(0)
|
||||
max := floatPtr(100)
|
||||
opt := chartYAxisOption(min, max)
|
||||
if opt.Min != min || opt.Max != max {
|
||||
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
|
||||
}
|
||||
if opt.LabelCount != 11 {
|
||||
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
|
||||
}
|
||||
if got := opt.ValueFormatter(1000); got != "1к" {
|
||||
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||
r1 := newMetricsRing(4)
|
||||
r2 := newMetricsRing(4)
|
||||
r1.push(1000)
|
||||
r1.push(1100)
|
||||
r2.push(1200)
|
||||
r2.push(1300)
|
||||
|
||||
datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
|
||||
if len(datasets) != 2 {
|
||||
t.Fatalf("datasets=%d want 2", len(datasets))
|
||||
}
|
||||
if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if len(labels) != 2 {
|
||||
t.Fatalf("labels=%v want 2 entries", labels)
|
||||
}
|
||||
if labels[0] == "" || labels[1] == "" {
|
||||
t.Fatalf("labels should contain timeline values, got %v", labels)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
|
||||
body := renderNetworkInline()
|
||||
if !strings.Contains(body, "d.pending_change") {
|
||||
t.Fatalf("network UI should read pending network state from API: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
|
||||
t.Fatalf("network UI should periodically refresh network state: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
|
||||
t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootRendersDashboard(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -198,6 +359,44 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `id="task-log-overlay"`) {
|
||||
t.Fatalf("tasks page missing log modal overlay: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Previous</button>`) || !strings.Contains(body, `Next</button>`) {
|
||||
t.Fatalf("tasks page missing pagination controls: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
|
||||
@@ -83,16 +83,17 @@ func taskDisplayName(target, profile, loader string) string {
|
||||
|
||||
// Task represents one unit of work in the queue.
|
||||
type Task struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ElapsedSec int `json:"elapsed_sec,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
|
||||
// runtime fields (not serialised)
|
||||
job *jobState
|
||||
@@ -101,11 +102,11 @@ type Task struct {
|
||||
|
||||
// taskParams holds optional parameters parsed from the run request.
|
||||
type taskParams struct {
|
||||
Duration int `json:"duration,omitempty"`
|
||||
DiagLevel int `json:"diag_level,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
Duration int `json:"duration,omitempty"`
|
||||
DiagLevel int `json:"diag_level,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
@@ -172,13 +173,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions
|
||||
|
||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||
type taskQueue struct {
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
kmsgWatcher *kmsgWatcher
|
||||
}
|
||||
|
||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||
@@ -290,6 +292,30 @@ func (q *taskQueue) findJob(id string) (*jobState, bool) {
|
||||
return t.job, true
|
||||
}
|
||||
|
||||
type taskStreamSource struct {
|
||||
status string
|
||||
errMsg string
|
||||
logPath string
|
||||
job *jobState
|
||||
}
|
||||
|
||||
func (q *taskQueue) taskStreamSource(id string) (taskStreamSource, bool) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
for _, t := range q.tasks {
|
||||
if t.ID != id {
|
||||
continue
|
||||
}
|
||||
return taskStreamSource{
|
||||
status: t.Status,
|
||||
errMsg: t.ErrMsg,
|
||||
logPath: t.LogPath,
|
||||
job: t.job,
|
||||
}, true
|
||||
}
|
||||
return taskStreamSource{}, false
|
||||
}
|
||||
|
||||
func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
@@ -304,15 +330,19 @@ func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
|
||||
// snapshot returns a copy of all tasks sorted for display with newest tasks first.
|
||||
func (q *taskQueue) snapshot() []Task {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
out := make([]Task, len(q.tasks))
|
||||
for i, t := range q.tasks {
|
||||
out[i] = *t
|
||||
out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
if !out[i].CreatedAt.Equal(out[j].CreatedAt) {
|
||||
return out[i].CreatedAt.After(out[j].CreatedAt)
|
||||
}
|
||||
si := statusOrder(out[i].Status)
|
||||
sj := statusOrder(out[j].Status)
|
||||
if si != sj {
|
||||
@@ -321,7 +351,7 @@ func (q *taskQueue) snapshot() []Task {
|
||||
if out[i].Priority != out[j].Priority {
|
||||
return out[i].Priority > out[j].Priority
|
||||
}
|
||||
return out[i].CreatedAt.Before(out[j].CreatedAt)
|
||||
return out[i].Name < out[j].Name
|
||||
})
|
||||
return out
|
||||
}
|
||||
@@ -382,8 +412,16 @@ func (q *taskQueue) worker() {
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
}
|
||||
|
||||
q.runTask(t, j, ctx)
|
||||
|
||||
if q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
@@ -589,6 +627,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// If the SAT archive was produced, check overall_status and write to component DB.
|
||||
if archive != "" {
|
||||
archivePath := app.ExtractArchivePath(archive)
|
||||
if err == nil {
|
||||
if app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||
}
|
||||
}
|
||||
if db := q.statusDB(); db != nil {
|
||||
app.ApplySATResultToDB(db, t.Target, archivePath)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
@@ -605,6 +656,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
j.finish("")
|
||||
}
|
||||
|
||||
func (q *taskQueue) statusDB() *app.ComponentStatusDB {
|
||||
if q.opts == nil || q.opts.App == nil {
|
||||
return nil
|
||||
}
|
||||
return q.opts.App.StatusDB
|
||||
}
|
||||
|
||||
func splitLines(s string) []string {
|
||||
var out []string
|
||||
for _, l := range splitNL(s) {
|
||||
@@ -714,23 +772,83 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
writeJSON(w, map[string]int{"cancelled": n})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Request) {
|
||||
// Cancel all queued/running tasks in the queue first.
|
||||
globalQueue.mu.Lock()
|
||||
now := time.Now()
|
||||
cancelled := 0
|
||||
for _, t := range globalQueue.tasks {
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
cancelled++
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
cancelled++
|
||||
}
|
||||
}
|
||||
globalQueue.persistLocked()
|
||||
globalQueue.mu.Unlock()
|
||||
|
||||
// Kill orphaned test worker processes at the OS level.
|
||||
killed := platform.KillTestWorkers()
|
||||
writeJSON(w, map[string]any{
|
||||
"cancelled": cancelled,
|
||||
"killed": len(killed),
|
||||
"processes": killed,
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
// Wait up to 5s for the task to get a job (it may be pending)
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
var j *jobState
|
||||
for time.Now().Before(deadline) {
|
||||
if jj, ok := globalQueue.findJob(id); ok {
|
||||
j = jj
|
||||
break
|
||||
}
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
}
|
||||
if j == nil {
|
||||
http.Error(w, "task not found or not yet started", http.StatusNotFound)
|
||||
src, ok := globalQueue.taskStreamSource(id)
|
||||
if !ok {
|
||||
http.Error(w, "task not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
streamJob(w, r, j)
|
||||
if src.job != nil {
|
||||
streamJob(w, r, src.job)
|
||||
return
|
||||
}
|
||||
if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
|
||||
j := newTaskJobState(src.logPath)
|
||||
j.finish(src.errMsg)
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
sseWrite(w, "", "Task is queued. Waiting for worker...")
|
||||
ticker := time.NewTicker(200 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
src, ok = globalQueue.taskStreamSource(id)
|
||||
if !ok {
|
||||
sseWrite(w, "done", "task not found")
|
||||
return
|
||||
}
|
||||
if src.job != nil {
|
||||
streamSubscribedJob(w, r, src.job)
|
||||
return
|
||||
}
|
||||
if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
|
||||
j := newTaskJobState(src.logPath)
|
||||
j.finish(src.errMsg)
|
||||
streamSubscribedJob(w, r, j)
|
||||
return
|
||||
}
|
||||
case <-r.Context().Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
||||
@@ -767,8 +885,18 @@ func (q *taskQueue) loadLocked() {
|
||||
params: pt.Params,
|
||||
}
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||
t.Status = TaskPending
|
||||
if t.Status == TaskRunning {
|
||||
// The task was interrupted by a bee-web restart. Child processes
|
||||
// (e.g. bee-gpu-burn-worker) survive the restart in their own
|
||||
// process groups and cannot be cancelled retroactively. Mark the
|
||||
// task as failed so the user can decide whether to re-run it
|
||||
// rather than blindly re-launching duplicate workers.
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
t.ErrMsg = "interrupted by bee-web restart"
|
||||
} else if t.Status == TaskPending {
|
||||
t.StartedAt = nil
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
}
|
||||
@@ -808,3 +936,21 @@ func (q *taskQueue) persistLocked() {
|
||||
}
|
||||
_ = os.Rename(tmp, q.statePath)
|
||||
}
|
||||
|
||||
func taskElapsedSec(t *Task, now time.Time) int {
|
||||
if t == nil || t.StartedAt == nil || t.StartedAt.IsZero() {
|
||||
return 0
|
||||
}
|
||||
start := *t.StartedAt
|
||||
if !t.CreatedAt.IsZero() && start.Before(t.CreatedAt) {
|
||||
start = t.CreatedAt
|
||||
}
|
||||
end := now
|
||||
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
end = *t.DoneAt
|
||||
}
|
||||
if end.Before(start) {
|
||||
return 0
|
||||
}
|
||||
return int(end.Sub(start).Round(time.Second) / time.Second)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@ package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -24,21 +26,34 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
}
|
||||
|
||||
started := time.Now().Add(-time.Minute)
|
||||
task := &Task{
|
||||
ID: "task-1",
|
||||
|
||||
// A task that was pending (not yet started) must be re-queued on restart.
|
||||
pendingTask := &Task{
|
||||
ID: "task-pending",
|
||||
Name: "Memory Burn-in",
|
||||
Target: "memory-stress",
|
||||
Priority: 2,
|
||||
Status: TaskRunning,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now().Add(-2 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{
|
||||
Duration: 300,
|
||||
BurnProfile: "smoke",
|
||||
},
|
||||
params: taskParams{Duration: 300, BurnProfile: "smoke"},
|
||||
}
|
||||
// A task that was running when bee-web crashed must NOT be re-queued —
|
||||
// its child processes (e.g. gpu-burn-worker) survive the restart in
|
||||
// their own process groups and can't be cancelled retroactively.
|
||||
runningTask := &Task{
|
||||
ID: "task-running",
|
||||
Name: "NVIDIA GPU Stress",
|
||||
Target: "nvidia-stress",
|
||||
Priority: 1,
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now().Add(-3 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{Duration: 86400},
|
||||
}
|
||||
for _, task := range []*Task{pendingTask, runningTask} {
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
}
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
q.persistLocked()
|
||||
|
||||
recovered := &taskQueue{
|
||||
@@ -48,18 +63,47 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
}
|
||||
recovered.loadLocked()
|
||||
|
||||
if len(recovered.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(recovered.tasks))
|
||||
if len(recovered.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(recovered.tasks))
|
||||
}
|
||||
got := recovered.tasks[0]
|
||||
if got.Status != TaskPending {
|
||||
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
||||
|
||||
byID := map[string]*Task{}
|
||||
for i := range recovered.tasks {
|
||||
byID[recovered.tasks[i].ID] = recovered.tasks[i]
|
||||
}
|
||||
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("params=%+v", got.params)
|
||||
|
||||
// Pending task must be re-queued as pending with params intact.
|
||||
p := byID["task-pending"]
|
||||
if p == nil {
|
||||
t.Fatal("task-pending not found")
|
||||
}
|
||||
if got.LogPath == "" {
|
||||
t.Fatal("expected log path")
|
||||
if p.Status != TaskPending {
|
||||
t.Fatalf("pending task: status=%q want %q", p.Status, TaskPending)
|
||||
}
|
||||
if p.StartedAt != nil {
|
||||
t.Fatalf("pending task: started_at=%v want nil", p.StartedAt)
|
||||
}
|
||||
if p.params.Duration != 300 || p.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("pending task: params=%+v", p.params)
|
||||
}
|
||||
if p.LogPath == "" {
|
||||
t.Fatal("pending task: expected log path")
|
||||
}
|
||||
|
||||
// Running task must be marked failed, not re-queued, to prevent
|
||||
// launching duplicate workers (e.g. a second set of gpu-burn-workers).
|
||||
r := byID["task-running"]
|
||||
if r == nil {
|
||||
t.Fatal("task-running not found")
|
||||
}
|
||||
if r.Status != TaskFailed {
|
||||
t.Fatalf("running task: status=%q want %q", r.Status, TaskFailed)
|
||||
}
|
||||
if r.ErrMsg == "" {
|
||||
t.Fatal("running task: expected non-empty error message")
|
||||
}
|
||||
if r.DoneAt == nil {
|
||||
t.Fatal("running task: expected done_at to be set")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,6 +124,130 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||
q := &taskQueue{
|
||||
tasks: []*Task{
|
||||
{
|
||||
ID: "old-running",
|
||||
Name: "Old Running",
|
||||
Status: TaskRunning,
|
||||
Priority: 10,
|
||||
CreatedAt: now.Add(-3 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "new-done",
|
||||
Name: "New Done",
|
||||
Status: TaskDone,
|
||||
Priority: 0,
|
||||
CreatedAt: now.Add(-1 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "mid-pending",
|
||||
Name: "Mid Pending",
|
||||
Status: TaskPending,
|
||||
Priority: 1,
|
||||
CreatedAt: now.Add(-2 * time.Minute),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got := q.snapshot()
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("snapshot len=%d want 3", len(got))
|
||||
}
|
||||
if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
|
||||
t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
logPath := filepath.Join(dir, "task.log")
|
||||
if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "done-1",
|
||||
Name: "Done Task",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now(),
|
||||
LogPath: logPath,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
|
||||
req.SetPathValue("id", "done-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
|
||||
t.Fatalf("body=%q", body)
|
||||
}
|
||||
if !strings.Contains(body, "event: done\n") {
|
||||
t.Fatalf("missing done event: %q", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "pending-1",
|
||||
Name: "Pending Task",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
|
||||
req.SetPathValue("id", "pending-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
|
||||
cancel()
|
||||
<-done
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
return
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
cancel()
|
||||
<-done
|
||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||
}
|
||||
|
||||
func TestResolveBurnPreset(t *testing.T) {
|
||||
tests := []struct {
|
||||
profile string
|
||||
@@ -236,6 +404,26 @@ func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
|
||||
now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
|
||||
created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
|
||||
started := time.Time{}
|
||||
task := &Task{
|
||||
Status: TaskRunning,
|
||||
CreatedAt: created,
|
||||
StartedAt: &started,
|
||||
}
|
||||
if got := taskElapsedSec(task, now); got != 0 {
|
||||
t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
|
||||
}
|
||||
|
||||
stale := created.Add(-24 * time.Hour)
|
||||
task.StartedAt = &stale
|
||||
if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
|
||||
t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{},
|
||||
|
||||
@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
|
||||
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||
There is no client-side canvas or JS chart library.
|
||||
|
||||
## Rule: live charts must be visually uniform
|
||||
|
||||
Live charts are a single UI family, not a set of one-off widgets. New charts and
|
||||
changes to existing charts must keep the same rendering model and presentation
|
||||
rules unless there is an explicit architectural decision to diverge.
|
||||
|
||||
Default expectations:
|
||||
|
||||
- same server-side SVG pipeline for all live metrics charts
|
||||
- same refresh behaviour and failure handling in the browser
|
||||
- same canvas size class and card layout
|
||||
- same legend placement policy across charts
|
||||
- same axis, title, and summary conventions
|
||||
- no chart-specific visual exceptions added as a quick fix
|
||||
|
||||
Current default for live charts:
|
||||
|
||||
- legend below the plot area when a chart has 8 series or fewer
|
||||
- legend hidden when a chart has more than 8 series
|
||||
- 10 equal Y-axis steps across the chart height
|
||||
- 1400 x 360 SVG canvas with legend
|
||||
- 1400 x 288 SVG canvas without legend
|
||||
- full-width card rendering in a single-column stack
|
||||
|
||||
If one chart needs a different layout or legend behaviour, treat that as a
|
||||
design-level decision affecting the whole chart family, not as a local tweak to
|
||||
just one endpoint.
|
||||
|
||||
### Why go-analyze/charts
|
||||
|
||||
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
|
||||
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||
|
||||
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
|
||||
Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
|
||||
the legend is hidden. The page renders them at `width: 100%` in a
|
||||
single-column layout so they always fill the viewport width.
|
||||
|
||||
### Ring buffers
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||
|
||||
**Date:** 2026-04-01
|
||||
**Status:** active
|
||||
**Status:** resolved
|
||||
|
||||
## Context
|
||||
|
||||
@@ -58,6 +58,18 @@ Root cause of the false alarm:
|
||||
- as a result, we re-entered the same memtest investigation loop even though
|
||||
the real ISO was already correct
|
||||
|
||||
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
|
||||
|
||||
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
|
||||
still carried live-build's default memtest layout (`live/memtest.bin`,
|
||||
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
|
||||
- that mismatch is expected to trigger project recovery, because `bee` requires
|
||||
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
|
||||
- however, `build.sh` exited before recovery because `set -e` treated a direct
|
||||
`iso_memtest_present` return code of `1` as fatal
|
||||
- so the next repeated loop was caused by shell control flow, not by proof that
|
||||
the recovery design itself was wrong
|
||||
|
||||
## Known Failed Attempts
|
||||
|
||||
These approaches were already tried and should not be repeated blindly:
|
||||
@@ -102,6 +114,8 @@ Any future memtest fix must explicitly identify:
|
||||
- and a post-build proof from a real ISO, not only from intermediate workdir files
|
||||
- whether the ISO inspection step itself succeeded, rather than merely whether
|
||||
the validator printed a memtest warning
|
||||
- whether a non-zero probe is intentionally handled inside an `if` / `case`
|
||||
context rather than accidentally tripping `set -e`
|
||||
|
||||
## Decision
|
||||
|
||||
@@ -134,6 +148,8 @@ Current implementation direction:
|
||||
- install a stable ISO reader in the builder image
|
||||
- fail with an explicit reader error if ISO listing/extraction fails
|
||||
- do not treat reader failure as evidence that memtest is missing
|
||||
- do not call a probe that may return "needs recovery" as a bare command under
|
||||
`set -e`; wrap it in explicit control flow
|
||||
|
||||
## Consequences
|
||||
|
||||
@@ -144,3 +160,65 @@ Current implementation direction:
|
||||
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||
"missing memtest" warning without a successful ISO read is not evidence.
|
||||
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||
|
||||
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||
|
||||
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||
|
||||
### Components
|
||||
|
||||
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||
|
||||
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
|
||||
those files may not exist yet. Instead:
|
||||
|
||||
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||
If they do not exist, the hook warns and continues (does not fail).
|
||||
|
||||
Controlled by `BEE_REQUIRE_MEMTEST=1` env var to turn warnings into hard errors when needed.
|
||||
|
||||
**2. Post-`lb build` recovery step in `build.sh`**
|
||||
|
||||
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||
contains all required memtest artifacts. If not:
|
||||
|
||||
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||
the ISO with the patched tree.
|
||||
|
||||
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||
|
||||
**3. ISO validation hardening**
|
||||
|
||||
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||
handled — it does not abort the build prematurely.
|
||||
|
||||
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
|
||||
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||
This prevents the false-negative loop that burned 2026-04-01 v3.14–v3.19.
|
||||
|
||||
### Why this works when earlier attempts did not
|
||||
|
||||
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||
|
||||
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||
There is no ordering dependency to get wrong.
|
||||
|
||||
### Do not revert
|
||||
|
||||
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||
live-build alone produces all four required artifacts:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- memtest entry in `boot/grub/grub.cfg`
|
||||
- memtest entry in `isolinux/live.cfg`
|
||||
|
||||
@@ -8,7 +8,7 @@ NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.2-1
|
||||
DCGM_VERSION=4.5.3-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
|
||||
@@ -32,7 +32,7 @@ lb config noauto \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
@@ -591,9 +591,12 @@ recover_iso_memtest() {
|
||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||
LOG_DIR="${DIST_DIR}/${ISO_BASENAME}.logs"
|
||||
LOG_ARCHIVE="${DIST_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
||||
ISO_OUT="${DIST_DIR}/${ISO_BASENAME}.iso"
|
||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
||||
mkdir -p "${OUT_DIR}"
|
||||
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
|
||||
LOG_ARCHIVE="${OUT_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
||||
ISO_OUT="${OUT_DIR}/${ISO_BASENAME}.iso"
|
||||
LOG_OUT="${LOG_DIR}/build.log"
|
||||
|
||||
cleanup_build_log() {
|
||||
@@ -616,7 +619,8 @@ cleanup_build_log() {
|
||||
|
||||
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ] && command -v tar >/dev/null 2>&1; then
|
||||
rm -f "${LOG_ARCHIVE}"
|
||||
tar -czf "${LOG_ARCHIVE}" -C "${DIST_DIR}" "$(basename "${LOG_DIR}")" 2>/dev/null || true
|
||||
tar -czf "${LOG_ARCHIVE}" -C "$(dirname "${LOG_DIR}")" "$(basename "${LOG_DIR}")" 2>/dev/null || true
|
||||
rm -rf "${LOG_DIR}"
|
||||
fi
|
||||
|
||||
exit "${status}"
|
||||
@@ -862,7 +866,6 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
@@ -1136,13 +1139,16 @@ fi
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
if [ -f "$ISO_RAW" ]; then
|
||||
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
|
||||
iso_memtest_present "$ISO_RAW"
|
||||
memtest_status=$?
|
||||
if [ "$memtest_status" -eq 1 ]; then
|
||||
recover_iso_memtest "${LB_DIR}" "$ISO_RAW"
|
||||
dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW"
|
||||
elif [ "$memtest_status" -eq 2 ]; then
|
||||
memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW"
|
||||
if iso_memtest_present "$ISO_RAW"; then
|
||||
:
|
||||
else
|
||||
memtest_status=$?
|
||||
if [ "$memtest_status" -eq 1 ]; then
|
||||
recover_iso_memtest "${LB_DIR}" "$ISO_RAW"
|
||||
dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW"
|
||||
elif [ "$memtest_status" -eq 2 ]; then
|
||||
memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW"
|
||||
fi
|
||||
fi
|
||||
validate_iso_memtest "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# AMD GPU firmware
|
||||
firmware-amd-graphics
|
||||
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||
|
||||
@@ -71,9 +71,7 @@ lightdm
|
||||
firmware-linux-free
|
||||
firmware-linux-nonfree
|
||||
firmware-misc-nonfree
|
||||
firmware-amd-graphics
|
||||
firmware-realtek
|
||||
firmware-intel-sound
|
||||
firmware-bnx2
|
||||
firmware-bnx2x
|
||||
firmware-cavium
|
||||
|
||||
@@ -52,6 +52,14 @@ else
|
||||
fail "nvidia-smi: NOT FOUND"
|
||||
fi
|
||||
|
||||
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
ok "$tool found: $p"
|
||||
else
|
||||
fail "$tool: NOT FOUND"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "-- NVIDIA modules --"
|
||||
KO_DIR="/usr/local/lib/nvidia"
|
||||
|
||||
@@ -190,4 +190,16 @@ CHOSEN_FORMAT=$(choose_format) || {
|
||||
}
|
||||
|
||||
echo "format=${CHOSEN_FORMAT}"
|
||||
exec ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${JOHN_DEVICES}"
|
||||
PIDS=""
|
||||
_first=1
|
||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||
[ "${_first}" = "1" ] || sleep 3
|
||||
_first=0
|
||||
./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
|
||||
PIDS="${PIDS} $!"
|
||||
done
|
||||
FAIL=0
|
||||
for pid in ${PIDS}; do
|
||||
wait "${pid}" || FAIL=$((FAIL+1))
|
||||
done
|
||||
[ "${FAIL}" -eq 0 ] || { echo "john: ${FAIL} device(s) failed" >&2; exit 1; }
|
||||
|
||||
@@ -24,7 +24,7 @@ chromium \
|
||||
--no-first-run \
|
||||
--disable-session-crashed-bubble \
|
||||
--disable-features=TranslateUI \
|
||||
--start-fullscreen \
|
||||
--start-maximized \
|
||||
http://localhost/ &
|
||||
|
||||
exec openbox
|
||||
|
||||
@@ -3,6 +3,11 @@
|
||||
# Type 'a' at any prompt to abort, 'b' to go back.
|
||||
set -e
|
||||
|
||||
# Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed.
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
exec sudo "$0" "$@"
|
||||
fi
|
||||
|
||||
abort() { echo "Aborted."; exit 0; }
|
||||
|
||||
ask() {
|
||||
|
||||
Reference in New Issue
Block a user