package platform

import (
	"os"
	"os/exec"
	"strings"
	"time"

	"bee/audit/internal/schema"
)

// runtimeRequiredTools lists the executables that CollectRuntimeHealth passes
// to System.CheckTools. Every entry that comes back not OK is reported as a
// "tool_missing" warning.
var runtimeRequiredTools = []string{
	"dmidecode",
	"lspci",
	"lsblk",
	"smartctl",
	"nvme",
	"ipmitool",
	"nvidia-smi",
	"nvidia-bug-report.sh",
	"bee-gpu-stress",
	"dhclient",
	"mount",
}

// runtimeTrackedServices lists the service names whose state is queried via
// System.ServiceState and recorded verbatim in the health report.
var runtimeTrackedServices = []string{
	"bee-network",
	"bee-nvidia",
	"bee-preflight",
	"bee-audit",
	"bee-web",
	"bee-sshsetup",
}

// CollectRuntimeHealth gathers a snapshot of the runtime environment: export
// directory availability, per-interface IPv4 outcome, required tools, tracked
// services, NVIDIA driver readiness and CUDA readiness.
//
// exportDir is trimmed of surrounding whitespace; when non-empty it is created
// with os.MkdirAll, and a failure there marks the whole report "FAILED".
//
// The overall Status is "OK" when no issues were recorded, "FAILED" when the
// export directory could not be created, and "PARTIAL" when any other issue
// was recorded. The returned error is always nil; problems are reported
// through the Issues slice instead.
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
	health := schema.RuntimeHealth{
		Status:    "OK",
		CheckedAt: time.Now().UTC().Format(time.RFC3339),
		ExportDir: strings.TrimSpace(exportDir),
	}

	if health.ExportDir != "" {
		if err := os.MkdirAll(health.ExportDir, 0755); err != nil {
			health.Status = "FAILED"
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "export_dir_unavailable",
				Severity:    "critical",
				Description: err.Error(),
			})
		}
	}

	// NOTE(review): a ListInterfaces error leaves Interfaces and NetworkStatus
	// empty and records no issue. Confirm this best-effort behavior is intended.
	interfaces, err := s.ListInterfaces()
	if err == nil {
		health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
		hasIPv4 := false
		missingIPv4 := false
		for _, iface := range interfaces {
			outcome := "no_offer"
			switch {
			case len(iface.IPv4) > 0:
				outcome = "lease_acquired"
				hasIPv4 = true
			case strings.EqualFold(iface.State, "DOWN"):
				// A link that is down is not counted as a missing lease.
				outcome = "link_down"
			default:
				missingIPv4 = true
			}
			health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
				Name:    iface.Name,
				State:   iface.State,
				IPv4:    iface.IPv4,
				Outcome: outcome,
			})
		}

		switch {
		case hasIPv4 && !missingIPv4:
			health.NetworkStatus = "OK"
		case hasIPv4:
			health.NetworkStatus = "PARTIAL"
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "dhcp_partial",
				Severity:    "warning",
				Description: "At least one interface did not obtain IPv4 connectivity.",
			})
		default:
			// Also reached when the interface list is empty.
			health.NetworkStatus = "FAILED"
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "dhcp_failed",
				Severity:    "warning",
				Description: "No physical interface obtained IPv4 connectivity.",
			})
		}
	}

	for _, tool := range s.CheckTools(runtimeRequiredTools) {
		health.Tools = append(health.Tools, schema.RuntimeToolStatus{
			Name: tool.Name,
			Path: tool.Path,
			OK:   tool.OK,
		})
		if !tool.OK {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "tool_missing",
				Severity:    "warning",
				Description: "Required tool missing: " + tool.Name,
			})
		}
	}

	for _, name := range runtimeTrackedServices {
		health.Services = append(health.Services, schema.RuntimeServiceStatus{
			Name:   name,
			Status: s.ServiceState(name),
		})
	}

	// Run both driver probes before reporting anything, so the report never
	// claims the kernel module is missing while DriverReady is true.
	lsmodText := commandText("lsmod")
	// The trailing space keeps "nvidia_modeset", "nvidia_uvm", etc. from
	// matching the bare module name.
	moduleLoaded := strings.Contains(lsmodText, "nvidia ")
	smiOK := false
	if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
		smiOK = true
	}
	health.DriverReady = moduleLoaded || smiOK
	if !health.DriverReady {
		health.Issues = append(health.Issues, schema.RuntimeIssue{
			Code:        "nvidia_kernel_module_missing",
			Severity:    "warning",
			Description: "NVIDIA kernel module is not loaded.",
		})
	}
	// The modeset check is only meaningful when lsmod itself showed the driver.
	if moduleLoaded && !strings.Contains(lsmodText, "nvidia_modeset") {
		health.Issues = append(health.Issues, schema.RuntimeIssue{
			Code:        "nvidia_modeset_failed",
			Severity:    "warning",
			Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
		})
	}

	// A missing bee-gpu-stress is already reported as tool_missing above, so
	// the CUDA probe is skipped when the binary is not on PATH.
	if _, lookErr := exec.LookPath("bee-gpu-stress"); lookErr == nil {
		out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
		switch {
		case err == nil:
			health.CUDAReady = true
		case strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready"):
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "cuda_runtime_not_ready",
				Severity:    "warning",
				Description: "CUDA runtime is not ready for GPU SAT.",
			})
		}
	}

	if health.Status != "FAILED" && len(health.Issues) > 0 {
		health.Status = "PARTIAL"
	}
	return health, nil
}

// commandText runs name with args and returns its combined stdout and stderr
// as a string. Output from a command that exits with an error is still
// returned; a command that produces no output yields "".
func commandText(name string, args ...string) string {
	// The error is dropped on purpose: callers only inspect the text, and a
	// failed command with no output already yields the empty string.
	raw, _ := exec.Command(name, args...).CombinedOutput()
	return string(raw)
}