- Task queue: all SAT/audit jobs enqueue and run one-at-a-time; tasks persist past page navigation; new Tasks page with cancel/priority/log stream - UI: consolidate nav (Validate, Burn, Tasks, Tools); Audit becomes modal; Dashboard hardware summary badges + split metrics charts (load/temp/power); Tools page consolidates network, services, install, support bundle - AMD GPU: acceptance test and stress burn cards; GPU presence API greys out irrelevant SAT cards automatically - Burn tests: Memory Stress (stress-ng --vm), SAT Stress (stressapptest) - Install to RAM: copies squashfs to /dev/shm, re-associates loop devices via LOOP_CHANGE_FD ioctl so live media can be ejected - Charts: relative time axis (0 = now, negative left) - memtester: LimitMEMLOCK=infinity in bee-web.service; empty output → UNSUPPORTED - SAT overlay applied dynamically on every /audit.json serve - MIME panic guard for LiveCD ramdisk I/O errors - ISO: add memtest86+, stressapptest packages; memtest86+ GRUB entry; disable screensaver/DPMS in bee-openbox-session - Unknown SAT status severity = 1 (does not override OK) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
219 lines
6.2 KiB
Go
219 lines
6.2 KiB
Go
package app
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
|
|
"bee/audit/internal/schema"
|
|
)
|
|
|
|
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
|
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
|
return
|
|
}
|
|
if summary, ok := loadLatestSATSummary(baseDir, "gpu-amd-"); ok {
|
|
applyGPUVendorSAT(snap.PCIeDevices, "amd", summary)
|
|
}
|
|
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
|
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
|
}
|
|
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
|
applyMemorySAT(snap.Memory, summary)
|
|
}
|
|
if summary, ok := loadLatestSATSummary(baseDir, "cpu-"); ok {
|
|
applyCPUSAT(snap.CPUs, summary)
|
|
}
|
|
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
|
applyStorageSAT(snap.Storage, summary)
|
|
}
|
|
}
|
|
|
|
// satSummary is the in-memory form of one SAT run's summary.txt.
type satSummary struct {
	// runAtUTC is the trimmed "run_at_utc" value of the summary;
	// overall is the trimmed, upper-cased "overall_status" value.
	runAtUTC, overall string

	// kv holds every key/value pair parsed from the summary, including the
	// two keys mirrored above.
	kv map[string]string
}
|
|
|
|
func loadLatestSATSummary(baseDir, prefix string) (satSummary, bool) {
|
|
matches, err := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
|
|
if err != nil || len(matches) == 0 {
|
|
return satSummary{}, false
|
|
}
|
|
sort.Strings(matches)
|
|
raw, err := os.ReadFile(matches[len(matches)-1])
|
|
if err != nil {
|
|
return satSummary{}, false
|
|
}
|
|
kv := parseKeyValueSummary(string(raw))
|
|
return satSummary{
|
|
runAtUTC: strings.TrimSpace(kv["run_at_utc"]),
|
|
overall: strings.ToUpper(strings.TrimSpace(kv["overall_status"])),
|
|
kv: kv,
|
|
}, true
|
|
}
|
|
|
|
func applyGPUVendorSAT(devs []schema.HardwarePCIeDevice, vendor string, summary satSummary) {
|
|
status, description, ok := satSummaryStatus(summary, vendor+" GPU SAT")
|
|
if !ok {
|
|
return
|
|
}
|
|
for i := range devs {
|
|
if !matchesGPUVendor(devs[i], vendor) {
|
|
continue
|
|
}
|
|
mergeComponentStatus(&devs[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
|
}
|
|
}
|
|
|
|
func applyMemorySAT(dimms []schema.HardwareMemory, summary satSummary) {
|
|
status, description, ok := satSummaryStatus(summary, "memory SAT")
|
|
if !ok {
|
|
return
|
|
}
|
|
for i := range dimms {
|
|
mergeComponentStatus(&dimms[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
|
}
|
|
}
|
|
|
|
func applyCPUSAT(cpus []schema.HardwareCPU, summary satSummary) {
|
|
status, description, ok := satSummaryStatus(summary, "CPU SAT")
|
|
if !ok {
|
|
return
|
|
}
|
|
for i := range cpus {
|
|
mergeComponentStatus(&cpus[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
|
}
|
|
}
|
|
|
|
func applyStorageSAT(disks []schema.HardwareStorage, summary satSummary) {
|
|
byDevice := parseStorageSATStatus(summary)
|
|
for i := range disks {
|
|
devPath, _ := disks[i].Telemetry["linux_device"].(string)
|
|
devName := filepath.Base(strings.TrimSpace(devPath))
|
|
if devName == "" {
|
|
continue
|
|
}
|
|
result, ok := byDevice[devName]
|
|
if !ok {
|
|
continue
|
|
}
|
|
mergeComponentStatus(&disks[i].HardwareComponentStatus, summary.runAtUTC, result.status, result.description)
|
|
}
|
|
}
|
|
|
|
// satStatusResult is the aggregated SAT outcome for a single device.
type satStatusResult struct {
	// status is the component status string to merge;
	// description is the accompanying error text and may be empty.
	status, description string

	// ok distinguishes a populated result from the zero value returned by a
	// lookup of a device that has no entry.
	ok bool
}
|
|
|
|
func parseStorageSATStatus(summary satSummary) map[string]satStatusResult {
|
|
result := map[string]satStatusResult{}
|
|
for key, value := range summary.kv {
|
|
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
|
continue
|
|
}
|
|
base := strings.TrimSuffix(key, "_status")
|
|
idx := strings.Index(base, "_")
|
|
if idx <= 0 {
|
|
continue
|
|
}
|
|
devName := base[:idx]
|
|
step := strings.ReplaceAll(base[idx+1:], "_", "-")
|
|
stepStatus, desc, ok := satKeyStatus(strings.ToUpper(strings.TrimSpace(value)), "storage "+step)
|
|
if !ok {
|
|
continue
|
|
}
|
|
current := result[devName]
|
|
if !current.ok || statusSeverity(stepStatus) > statusSeverity(current.status) {
|
|
result[devName] = satStatusResult{status: stepStatus, description: desc, ok: true}
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
|
|
return satKeyStatus(summary.overall, label)
|
|
}
|
|
|
|
// satKeyStatus translates a raw SAT status word into a component status.
//
// rawStatus is compared case-insensitively after trimming surrounding
// whitespace. It returns the component status, an error description, and
// whether the word was recognized:
//
//	OK                                        -> "OK", "", true
//	FAILED                                    -> "Critical", label+" failed", true
//	PARTIAL, UNSUPPORTED, CANCELED, CANCELLED -> "Unknown", "", true
//	anything else                             -> "", "", false
func satKeyStatus(rawStatus, label string) (string, string, bool) {
	normalized := strings.ToUpper(strings.TrimSpace(rawStatus))

	if normalized == "OK" {
		// Success carries no description; that field is for problems only.
		return "OK", "", true
	}
	if normalized == "FAILED" {
		return "Critical", label + " failed", true
	}

	// The tool could not run or the test did not complete, so hardware health
	// cannot be asserted either way.
	for _, inconclusive := range [...]string{"PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED"} {
		if normalized == inconclusive {
			return "Unknown", "", true
		}
	}

	return "", "", false
}
|
|
|
|
func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
|
if component == nil || satStatus == "" {
|
|
return
|
|
}
|
|
current := strings.TrimSpace(ptrString(component.Status))
|
|
if current == "" || current == "Unknown" || statusSeverity(satStatus) > statusSeverity(current) {
|
|
component.Status = appStringPtr(satStatus)
|
|
if strings.TrimSpace(description) != "" {
|
|
component.ErrorDescription = appStringPtr(description)
|
|
}
|
|
if strings.TrimSpace(changedAt) != "" {
|
|
component.StatusChangedAt = appStringPtr(changedAt)
|
|
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
|
Status: satStatus,
|
|
ChangedAt: changedAt,
|
|
Details: appStringPtr(description),
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
// statusSeverity ranks a component status string for comparison; a larger
// value means a more severe status.
//
// "Critical" is 3, "Warning" is 2, and both "OK" and "Unknown" are 1, so an
// "Unknown" result never outranks an "OK" reported by another source. Any
// other value, including the empty string, ranks 0. Surrounding whitespace is
// ignored; the match is case-sensitive.
func statusSeverity(status string) int {
	switch trimmed := strings.TrimSpace(status); trimmed {
	case "Critical":
		return 3
	case "Warning":
		return 2
	case "OK", "Unknown":
		return 1
	}
	return 0
}
|
|
|
|
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
|
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
|
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
|
return false
|
|
}
|
|
}
|
|
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
|
switch vendor {
|
|
case "amd":
|
|
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
|
case "nvidia":
|
|
return strings.Contains(manufacturer, "nvidia")
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
// ptrString dereferences v, returning the empty string for a nil pointer.
func ptrString(v *string) string {
	if v != nil {
		return *v
	}
	return ""
}
|
|
|
|
// appStringPtr returns a pointer to a fresh copy of value; every call yields a
// distinct pointer.
func appStringPtr(value string) *string {
	copied := new(string)
	*copied = value
	return copied
}
|