package app import ( "context" "encoding/json" "fmt" "log/slog" "os" "strconv" "strings" "bee/audit/internal/collector" "bee/audit/internal/platform" "bee/audit/internal/runtimeenv" "bee/audit/internal/schema" ) var ( DefaultExportDir = "/appdata/bee/export" DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json" DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log" DefaultWebLogPath = DefaultExportDir + "/bee-web.log" DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log" DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log" DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log" DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json" DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" DefaultTechDumpDir = DefaultExportDir + "/techdump" DefaultSATBaseDir = DefaultExportDir + "/bee-sat" DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench" DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune" DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf" DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power" DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json" ) type App struct { network networkManager services serviceManager exports exportManager tools toolManager sat satRunner runtime runtimeChecker installer installer // StatusDB is the unified component health store (nil if unavailable). StatusDB *ComponentStatusDB } type ActionResult struct { Title string Body string } type networkManager interface { ListInterfaces() ([]platform.InterfaceInfo, error) DefaultRoute() string DHCPOne(iface string) (string, error) DHCPAll() (string, error) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) SetInterfaceState(iface string, up bool) error GetInterfaceState(iface string) (bool, error) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error } type serviceManager interface { ListBeeServices() ([]string, error) ServiceState(name string) string ServiceStatus(name string) (string, error) ServiceDo(name string, action platform.ServiceAction) (string, error) } type exportManager interface { ListRemovableTargets() ([]platform.RemovableTarget, error) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) } type toolManager interface { TailFile(path string, lines int) string CheckTools(names []string) []platform.ToolStatus } type installer interface { ListInstallDisks() ([]platform.InstallDisk, error) InstallToDisk(ctx context.Context, device string, logFile string) error IsLiveMediaInRAM() bool LiveBootSource() platform.LiveBootSource LiveMediaRAMState() platform.LiveMediaRAMState RunInstallToRAM(ctx context.Context, logFunc func(string)) error } type GPUPresenceResult struct { Nvidia bool AMD bool } func (a *App) DetectGPUPresence() GPUPresenceResult { vendor := a.sat.DetectGPUVendor() return GPUPresenceResult{ Nvidia: vendor == "nvidia", AMD: vendor == "amd", } } func (a *App) IsLiveMediaInRAM() bool { return a.installer.IsLiveMediaInRAM() } func (a *App) LiveBootSource() platform.LiveBootSource { return a.installer.LiveBootSource() } func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState { return a.installer.LiveMediaRAMState() } func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error { return a.installer.RunInstallToRAM(ctx, logFunc) } type satRunner interface { RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) ResetNvidiaGPU(index int) (string, error) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) DetectGPUVendor() string ListAMDGPUs() ([]platform.AMDGPUInfo, error) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) } type runtimeChecker interface { CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) CaptureTechnicalDump(baseDir string) error } func New(platform *platform.System) *App { a := &App{ network: platform, services: platform, exports: platform, tools: platform, sat: platform, runtime: platform, installer: platform, } if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil { a.StatusDB = db } return a } // ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results, // and returns the updated JSON. Used by the web UI to serve always-fresh status. func ApplySATOverlay(auditJSON []byte) ([]byte, error) { snap, err := readAuditSnapshot(auditJSON) if err != nil { return nil, err } applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil) return json.MarshalIndent(snap, "", " ") } func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) { var snap schema.HardwareIngestRequest if err := json.Unmarshal(auditJSON, &snap); err != nil { return schema.HardwareIngestRequest{}, err } collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt) return snap, nil } func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) { if runtimeMode == runtimeenv.ModeLiveCD { if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil { slog.Warn("capture technical dump", "err", err) } } result := collector.Run(runtimeMode) applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB) writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies) if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil { result.Runtime = &health } data, err := json.MarshalIndent(result, "", " ") if err != nil { return "", err } switch { case output == "stdout": _, err := os.Stdout.Write(append(data, '\n')) return "stdout", err case strings.HasPrefix(output, "file:"): path := strings.TrimPrefix(output, "file:") if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil { return "", err } return path, nil default: return "", fmt.Errorf("unknown output destination %q — use stdout or file:", output) } } func (a *App) RunRuntimePreflight(output string) (string, error) { health, err := a.runtime.CollectRuntimeHealth(DefaultExportDir) if err != nil { return "", err } data, err := json.MarshalIndent(health, "", " ") if err != nil { return "", err } switch { case output == "stdout": _, err := os.Stdout.Write(append(data, '\n')) return "stdout", err case strings.HasPrefix(output, "file:"): path := strings.TrimPrefix(output, "file:") if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil { return "", err } return path, nil default: return "", fmt.Errorf("unknown output destination %q — use stdout or file:", output) } } func (a *App) RunRuntimePreflightResult() (ActionResult, error) { path, err := a.RunRuntimePreflight("file:" + DefaultRuntimeJSONPath) body := "Runtime preflight completed." if path != "" { body = "Runtime health written to " + path } return ActionResult{Title: "Run self-check", Body: body}, err } func (a *App) RuntimeHealthResult() ActionResult { health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath) if err != nil { return ActionResult{Title: "Runtime issues", Body: "No runtime health found."} } driverLabel := "Driver ready" accelLabel := "CUDA ready" switch a.sat.DetectGPUVendor() { case "amd": driverLabel = "AMDGPU ready" accelLabel = "ROCm SMI ready" case "nvidia": driverLabel = "NVIDIA ready" } var body strings.Builder fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN")) fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir)) fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady) fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady) fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN")) if len(health.Issues) > 0 { body.WriteString("\n\nIssues:\n") for _, issue := range health.Issues { fmt.Fprintf(&body, "- %s: %s\n", issue.Code, issue.Description) } } return ActionResult{Title: "Runtime issues", Body: strings.TrimSpace(body.String())} } func (a *App) RunAuditNow(runtimeMode runtimeenv.Mode) (ActionResult, error) { path, err := a.RunAudit(runtimeMode, "file:"+DefaultAuditJSONPath) body := "Audit completed." if path != "" { body = "Audit output: " + path } return ActionResult{Title: "Run audit", Body: body}, err } func (a *App) RunAuditToDefaultFile(runtimeMode runtimeenv.Mode) (string, error) { return a.RunAudit(runtimeMode, "file:"+DefaultAuditJSONPath) } func (a *App) HealthSummaryResult() ActionResult { raw, err := os.ReadFile(DefaultAuditJSONPath) if err != nil { return ActionResult{Title: "Health summary", Body: "No audit JSON found."} } var snapshot schema.HardwareIngestRequest if err := json.Unmarshal(raw, &snapshot); err != nil { return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."} } collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt) summary := collector.BuildHealthSummary(snapshot.Hardware) var body strings.Builder status := summary.Status if status == "" { status = "Unknown" } fmt.Fprintf(&body, "Overall: %s\n", status) fmt.Fprintf(&body, "Storage: warn=%d fail=%d\n", summary.StorageWarn, summary.StorageFail) fmt.Fprintf(&body, "PCIe: warn=%d fail=%d\n", summary.PCIeWarn, summary.PCIeFail) fmt.Fprintf(&body, "PSU: warn=%d fail=%d\n", summary.PSUWarn, summary.PSUFail) fmt.Fprintf(&body, "Memory: warn=%d fail=%d\n", summary.MemoryWarn, summary.MemoryFail) for _, item := range latestSATSummaries() { fmt.Fprintf(&body, "\n\n%s", item) } if len(summary.Failures) > 0 { fmt.Fprintf(&body, "\n\nFailures:\n- %s", strings.Join(summary.Failures, "\n- ")) } if len(summary.Warnings) > 0 { fmt.Fprintf(&body, "\n\nWarnings:\n- %s", strings.Join(summary.Warnings, "\n- ")) } return ActionResult{Title: "Health summary", Body: strings.TrimSpace(body.String())} } func (a *App) MainBanner() string { raw, err := os.ReadFile(DefaultAuditJSONPath) if err != nil { return "" } var snapshot schema.HardwareIngestRequest if err := json.Unmarshal(raw, &snapshot); err != nil { return "" } collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt) var lines []string if system := formatSystemLine(snapshot.Hardware.Board); system != "" { lines = append(lines, system) } if cpu := formatCPULine(snapshot.Hardware.CPUs); cpu != "" { lines = append(lines, cpu) } if memory := formatMemoryLine(snapshot.Hardware.Memory); memory != "" { lines = append(lines, memory) } if storage := formatStorageLine(snapshot.Hardware.Storage); storage != "" { lines = append(lines, storage) } if gpu := formatGPULine(snapshot.Hardware.PCIeDevices); gpu != "" { lines = append(lines, gpu) } if ip := formatIPLine(a.network.ListInterfaces); ip != "" { lines = append(lines, ip) } return strings.TrimSpace(strings.Join(lines, "\n")) } func (a *App) FormatToolStatuses(statuses []platform.ToolStatus) string { var body strings.Builder for _, tool := range statuses { status := "MISSING" if tool.OK { status = "OK (" + tool.Path + ")" } fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status) } return strings.TrimSpace(body.String()) } func (a *App) ParsePrefix(raw string, fallback int) int { value, err := strconv.Atoi(strings.TrimSpace(raw)) if err != nil || value <= 0 { return fallback } return value } // writePSUStatusesToDB records PSU statuses collected during audit into the // component-status DB so they are visible in the Hardware Summary card. // PSU status is sourced from IPMI (ipmitool fru + sdr) during audit. func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) { if db == nil || len(psus) == 0 { return } const source = "audit:ipmi" worstStatus := "OK" for _, psu := range psus { if psu.Status == nil { continue } slot := "?" if psu.Slot != nil { slot = *psu.Slot } st := *psu.Status detail := "" if psu.ErrorDescription != nil { detail = *psu.ErrorDescription } db.Record("psu:"+slot, source, st, detail) switch st { case "Critical": worstStatus = "Critical" case "Warning": if worstStatus != "Critical" { worstStatus = "Warning" } } } db.Record("psu:all", source, worstStatus, "") } func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) { raw, err := os.ReadFile(path) if err != nil { return schema.RuntimeHealth{}, err } var health schema.RuntimeHealth if err := json.Unmarshal(raw, &health); err != nil { return schema.RuntimeHealth{}, err } return health, nil }