A (hardware-ingest-json v2.8-2.9): remove sensor location fields from schema and collector; tag HardwareMemory.Location as json:"-"; add PlatformConfig to HardwareSnapshot. B (no-hardcoded-vendors): consolidate PCI vendor IDs into collector/pci_vendors.go; replace all vendor-name string checks in isGPUDevice, isNVIDIADevice, isMellanoxDevice, isAMDGPUDevice, matchesGPUVendor (sat_overlay), and validateIsVendorGPU (page_validate) with numeric vendor_id comparisons. C (module-structure): split app/app.go (1413 lines) into app.go + app_format.go, app_network.go, app_services.go, app_packs.go, app_install.go — no logic changes. D (go-code-style): wrap bare return err in interfaceAdminState and interfaceIPv4Addrs (platform/network.go) with fmt.Errorf context including the interface name. E (go-project-bible): add bible-local/architecture/data-model.md and bible-local/architecture/api-surface.md. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
436 lines
15 KiB
Go
436 lines
15 KiB
Go
package app
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"bee/audit/internal/collector"
|
|
"bee/audit/internal/platform"
|
|
"bee/audit/internal/runtimeenv"
|
|
"bee/audit/internal/schema"
|
|
)
|
|
|
|
// Default output locations. Every path below is derived from
// DefaultExportDir. They are declared as variables rather than constants,
// so they remain assignable at run time.
var (
	// DefaultExportDir is the root directory for all default outputs.
	DefaultExportDir = "/appdata/bee/export"

	// JSON documents written directly into the export directory.
	DefaultAuditJSONPath   = DefaultExportDir + "/bee-audit.json"
	DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"

	// Log files written directly into the export directory.
	DefaultAuditLogPath   = DefaultExportDir + "/bee-audit.log"
	DefaultWebLogPath     = DefaultExportDir + "/bee-web.log"
	DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
	DefaultNvidiaLogPath  = DefaultExportDir + "/bee-nvidia.log"
	DefaultSSHLogPath     = DefaultExportDir + "/bee-sshsetup.log"
	DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"

	// Sub-directories of the export directory.
	DefaultTechDumpDir = DefaultExportDir + "/techdump"
	DefaultSATBaseDir  = DefaultExportDir + "/bee-sat"

	// Benchmark tree: a base directory plus the entries rooted in it.
	DefaultBeeBenchBaseDir               = DefaultExportDir + "/bee-bench"
	DefaultBeeBenchAutotuneDir           = DefaultBeeBenchBaseDir + "/autotune"
	DefaultBeeBenchPerfDir               = DefaultBeeBenchBaseDir + "/perf"
	DefaultBeeBenchPowerDir              = DefaultBeeBenchBaseDir + "/power"
	DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
)
|
|
|
|
type App struct {
|
|
network networkManager
|
|
services serviceManager
|
|
exports exportManager
|
|
tools toolManager
|
|
sat satRunner
|
|
runtime runtimeChecker
|
|
installer installer
|
|
// StatusDB is the unified component health store (nil if unavailable).
|
|
StatusDB *ComponentStatusDB
|
|
}
|
|
|
|
// ActionResult is the outcome of an App action, expressed as a title plus a
// free-form text body.
type ActionResult struct {
	// Title is the short heading for the action.
	Title string
	// Body is the message text; it may span several lines.
	Body string
}
|
|
|
|
type networkManager interface {
|
|
ListInterfaces() ([]platform.InterfaceInfo, error)
|
|
DefaultRoute() string
|
|
DHCPOne(iface string) (string, error)
|
|
DHCPAll() (string, error)
|
|
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
|
SetInterfaceState(iface string, up bool) error
|
|
GetInterfaceState(iface string) (bool, error)
|
|
CaptureNetworkSnapshot() (platform.NetworkSnapshot, error)
|
|
RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error
|
|
}
|
|
|
|
type serviceManager interface {
|
|
ListBeeServices() ([]string, error)
|
|
ServiceState(name string) string
|
|
ServiceStatus(name string) (string, error)
|
|
ServiceDo(name string, action platform.ServiceAction) (string, error)
|
|
}
|
|
|
|
type exportManager interface {
|
|
ListRemovableTargets() ([]platform.RemovableTarget, error)
|
|
ExportFileToTarget(src string, target platform.RemovableTarget) (string, error)
|
|
}
|
|
|
|
type toolManager interface {
|
|
TailFile(path string, lines int) string
|
|
CheckTools(names []string) []platform.ToolStatus
|
|
}
|
|
|
|
type installer interface {
|
|
ListInstallDisks() ([]platform.InstallDisk, error)
|
|
InstallToDisk(ctx context.Context, device string, logFile string) error
|
|
IsLiveMediaInRAM() bool
|
|
LiveBootSource() platform.LiveBootSource
|
|
LiveMediaRAMState() platform.LiveMediaRAMState
|
|
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
|
}
|
|
|
|
// GPUPresenceResult reports which GPU vendor was detected. DetectGPUPresence
// sets each flag from the vendor string returned by the SAT runner.
type GPUPresenceResult struct {
	// Nvidia is true when the detected vendor is "nvidia".
	Nvidia bool
	// AMD is true when the detected vendor is "amd".
	AMD bool
}
|
|
|
|
func (a *App) DetectGPUPresence() GPUPresenceResult {
|
|
vendor := a.sat.DetectGPUVendor()
|
|
return GPUPresenceResult{
|
|
Nvidia: vendor == "nvidia",
|
|
AMD: vendor == "amd",
|
|
}
|
|
}
|
|
|
|
func (a *App) IsLiveMediaInRAM() bool {
|
|
return a.installer.IsLiveMediaInRAM()
|
|
}
|
|
|
|
func (a *App) LiveBootSource() platform.LiveBootSource {
|
|
return a.installer.LiveBootSource()
|
|
}
|
|
|
|
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
|
|
return a.installer.LiveMediaRAMState()
|
|
}
|
|
|
|
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
|
return a.installer.RunInstallToRAM(ctx, logFunc)
|
|
}
|
|
|
|
type satRunner interface {
|
|
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
|
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
|
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
|
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
|
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
|
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
|
ResetNvidiaGPU(index int) (string, error)
|
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
|
DetectGPUVendor() string
|
|
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
|
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
|
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
|
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
|
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
|
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
|
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
|
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
|
}
|
|
|
|
type runtimeChecker interface {
|
|
CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error)
|
|
CaptureTechnicalDump(baseDir string) error
|
|
}
|
|
|
|
func New(platform *platform.System) *App {
|
|
a := &App{
|
|
network: platform,
|
|
services: platform,
|
|
exports: platform,
|
|
tools: platform,
|
|
sat: platform,
|
|
runtime: platform,
|
|
installer: platform,
|
|
}
|
|
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
|
a.StatusDB = db
|
|
}
|
|
return a
|
|
}
|
|
|
|
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
|
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
|
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
|
snap, err := readAuditSnapshot(auditJSON)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
|
return json.MarshalIndent(snap, "", " ")
|
|
}
|
|
|
|
func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
|
|
var snap schema.HardwareIngestRequest
|
|
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
|
return schema.HardwareIngestRequest{}, err
|
|
}
|
|
collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt)
|
|
return snap, nil
|
|
}
|
|
|
|
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
|
if runtimeMode == runtimeenv.ModeLiveCD {
|
|
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
|
slog.Warn("capture technical dump", "err", err)
|
|
}
|
|
}
|
|
result := collector.Run(runtimeMode)
|
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
|
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
|
result.Runtime = &health
|
|
}
|
|
data, err := json.MarshalIndent(result, "", " ")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
switch {
|
|
case output == "stdout":
|
|
_, err := os.Stdout.Write(append(data, '\n'))
|
|
return "stdout", err
|
|
case strings.HasPrefix(output, "file:"):
|
|
path := strings.TrimPrefix(output, "file:")
|
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
return "", err
|
|
}
|
|
return path, nil
|
|
default:
|
|
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
|
}
|
|
}
|
|
|
|
func (a *App) RunRuntimePreflight(output string) (string, error) {
|
|
health, err := a.runtime.CollectRuntimeHealth(DefaultExportDir)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
data, err := json.MarshalIndent(health, "", " ")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
switch {
|
|
case output == "stdout":
|
|
_, err := os.Stdout.Write(append(data, '\n'))
|
|
return "stdout", err
|
|
case strings.HasPrefix(output, "file:"):
|
|
path := strings.TrimPrefix(output, "file:")
|
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
return "", err
|
|
}
|
|
return path, nil
|
|
default:
|
|
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
|
}
|
|
}
|
|
|
|
func (a *App) RunRuntimePreflightResult() (ActionResult, error) {
|
|
path, err := a.RunRuntimePreflight("file:" + DefaultRuntimeJSONPath)
|
|
body := "Runtime preflight completed."
|
|
if path != "" {
|
|
body = "Runtime health written to " + path
|
|
}
|
|
return ActionResult{Title: "Run self-check", Body: body}, err
|
|
}
|
|
|
|
func (a *App) RuntimeHealthResult() ActionResult {
|
|
health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath)
|
|
if err != nil {
|
|
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
|
|
}
|
|
driverLabel := "Driver ready"
|
|
accelLabel := "CUDA ready"
|
|
switch a.sat.DetectGPUVendor() {
|
|
case "amd":
|
|
driverLabel = "AMDGPU ready"
|
|
accelLabel = "ROCm SMI ready"
|
|
case "nvidia":
|
|
driverLabel = "NVIDIA ready"
|
|
}
|
|
var body strings.Builder
|
|
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
|
|
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
|
|
fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
|
|
fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
|
|
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
|
|
if len(health.Issues) > 0 {
|
|
body.WriteString("\n\nIssues:\n")
|
|
for _, issue := range health.Issues {
|
|
fmt.Fprintf(&body, "- %s: %s\n", issue.Code, issue.Description)
|
|
}
|
|
}
|
|
return ActionResult{Title: "Runtime issues", Body: strings.TrimSpace(body.String())}
|
|
}
|
|
|
|
func (a *App) RunAuditNow(runtimeMode runtimeenv.Mode) (ActionResult, error) {
|
|
path, err := a.RunAudit(runtimeMode, "file:"+DefaultAuditJSONPath)
|
|
body := "Audit completed."
|
|
if path != "" {
|
|
body = "Audit output: " + path
|
|
}
|
|
return ActionResult{Title: "Run audit", Body: body}, err
|
|
}
|
|
|
|
func (a *App) RunAuditToDefaultFile(runtimeMode runtimeenv.Mode) (string, error) {
|
|
return a.RunAudit(runtimeMode, "file:"+DefaultAuditJSONPath)
|
|
}
|
|
|
|
func (a *App) HealthSummaryResult() ActionResult {
|
|
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
|
if err != nil {
|
|
return ActionResult{Title: "Health summary", Body: "No audit JSON found."}
|
|
}
|
|
var snapshot schema.HardwareIngestRequest
|
|
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
|
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
|
}
|
|
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
|
|
|
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
|
var body strings.Builder
|
|
status := summary.Status
|
|
if status == "" {
|
|
status = "Unknown"
|
|
}
|
|
fmt.Fprintf(&body, "Overall: %s\n", status)
|
|
fmt.Fprintf(&body, "Storage: warn=%d fail=%d\n", summary.StorageWarn, summary.StorageFail)
|
|
fmt.Fprintf(&body, "PCIe: warn=%d fail=%d\n", summary.PCIeWarn, summary.PCIeFail)
|
|
fmt.Fprintf(&body, "PSU: warn=%d fail=%d\n", summary.PSUWarn, summary.PSUFail)
|
|
fmt.Fprintf(&body, "Memory: warn=%d fail=%d\n", summary.MemoryWarn, summary.MemoryFail)
|
|
for _, item := range latestSATSummaries() {
|
|
fmt.Fprintf(&body, "\n\n%s", item)
|
|
}
|
|
if len(summary.Failures) > 0 {
|
|
fmt.Fprintf(&body, "\n\nFailures:\n- %s", strings.Join(summary.Failures, "\n- "))
|
|
}
|
|
if len(summary.Warnings) > 0 {
|
|
fmt.Fprintf(&body, "\n\nWarnings:\n- %s", strings.Join(summary.Warnings, "\n- "))
|
|
}
|
|
return ActionResult{Title: "Health summary", Body: strings.TrimSpace(body.String())}
|
|
}
|
|
|
|
func (a *App) MainBanner() string {
|
|
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
|
|
var snapshot schema.HardwareIngestRequest
|
|
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
|
return ""
|
|
}
|
|
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
|
|
|
var lines []string
|
|
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
|
lines = append(lines, system)
|
|
}
|
|
if cpu := formatCPULine(snapshot.Hardware.CPUs); cpu != "" {
|
|
lines = append(lines, cpu)
|
|
}
|
|
if memory := formatMemoryLine(snapshot.Hardware.Memory); memory != "" {
|
|
lines = append(lines, memory)
|
|
}
|
|
if storage := formatStorageLine(snapshot.Hardware.Storage); storage != "" {
|
|
lines = append(lines, storage)
|
|
}
|
|
if gpu := formatGPULine(snapshot.Hardware.PCIeDevices); gpu != "" {
|
|
lines = append(lines, gpu)
|
|
}
|
|
if ip := formatIPLine(a.network.ListInterfaces); ip != "" {
|
|
lines = append(lines, ip)
|
|
}
|
|
|
|
return strings.TrimSpace(strings.Join(lines, "\n"))
|
|
}
|
|
|
|
func (a *App) FormatToolStatuses(statuses []platform.ToolStatus) string {
|
|
var body strings.Builder
|
|
for _, tool := range statuses {
|
|
status := "MISSING"
|
|
if tool.OK {
|
|
status = "OK (" + tool.Path + ")"
|
|
}
|
|
fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status)
|
|
}
|
|
return strings.TrimSpace(body.String())
|
|
}
|
|
|
|
func (a *App) ParsePrefix(raw string, fallback int) int {
|
|
value, err := strconv.Atoi(strings.TrimSpace(raw))
|
|
if err != nil || value <= 0 {
|
|
return fallback
|
|
}
|
|
return value
|
|
}
|
|
|
|
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
|
// component-status DB so they are visible in the Hardware Summary card.
|
|
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
|
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
|
if db == nil || len(psus) == 0 {
|
|
return
|
|
}
|
|
const source = "audit:ipmi"
|
|
worstStatus := "OK"
|
|
for _, psu := range psus {
|
|
if psu.Status == nil {
|
|
continue
|
|
}
|
|
slot := "?"
|
|
if psu.Slot != nil {
|
|
slot = *psu.Slot
|
|
}
|
|
st := *psu.Status
|
|
detail := ""
|
|
if psu.ErrorDescription != nil {
|
|
detail = *psu.ErrorDescription
|
|
}
|
|
db.Record("psu:"+slot, source, st, detail)
|
|
switch st {
|
|
case "Critical":
|
|
worstStatus = "Critical"
|
|
case "Warning":
|
|
if worstStatus != "Critical" {
|
|
worstStatus = "Warning"
|
|
}
|
|
}
|
|
}
|
|
db.Record("psu:all", source, worstStatus, "")
|
|
}
|
|
|
|
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
|
raw, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return schema.RuntimeHealth{}, err
|
|
}
|
|
var health schema.RuntimeHealth
|
|
if err := json.Unmarshal(raw, &health); err != nil {
|
|
return schema.RuntimeHealth{}, err
|
|
}
|
|
return health, nil
|
|
}
|