165 lines
4.3 KiB
Go
165 lines
4.3 KiB
Go
package platform
|
|
|
|
import (
|
|
"os"
|
|
"os/exec"
|
|
"strings"
|
|
"time"
|
|
|
|
"bee/audit/internal/schema"
|
|
)
|
|
|
|
// runtimeRequiredTools lists the executables that CollectRuntimeHealth hands
// to System.CheckTools. Every entry that comes back not OK is reported as a
// "tool_missing" warning, so the order here is the order of the resulting
// tool statuses and issues.
var runtimeRequiredTools = []string{
	"dmidecode", "lspci", "lsblk",
	"smartctl", "nvme",
	"ipmitool",
	"nvidia-smi", "nvidia-bug-report.sh", "bee-gpu-stress",
	"dhclient", "mount",
}
|
|
|
|
// runtimeTrackedServices lists the service names that CollectRuntimeHealth
// passes, one at a time and in this order, to System.ServiceState; the
// returned state is stored alongside the name in the health snapshot.
var runtimeTrackedServices = []string{
	"bee-network", "bee-nvidia", "bee-preflight",
	"bee-audit", "bee-web", "bee-sshsetup",
}
|
|
|
|
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
|
checkedAt := time.Now().UTC().Format(time.RFC3339)
|
|
health := schema.RuntimeHealth{
|
|
Status: "OK",
|
|
CheckedAt: checkedAt,
|
|
ExportDir: strings.TrimSpace(exportDir),
|
|
}
|
|
|
|
if health.ExportDir != "" {
|
|
if err := os.MkdirAll(health.ExportDir, 0755); err != nil {
|
|
health.Status = "FAILED"
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "export_dir_unavailable",
|
|
Severity: "critical",
|
|
Description: err.Error(),
|
|
})
|
|
}
|
|
}
|
|
|
|
interfaces, err := s.ListInterfaces()
|
|
if err == nil {
|
|
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
|
hasIPv4 := false
|
|
missingIPv4 := false
|
|
for _, iface := range interfaces {
|
|
outcome := "no_offer"
|
|
if len(iface.IPv4) > 0 {
|
|
outcome = "lease_acquired"
|
|
hasIPv4 = true
|
|
} else if strings.EqualFold(iface.State, "DOWN") {
|
|
outcome = "link_down"
|
|
} else {
|
|
missingIPv4 = true
|
|
}
|
|
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
|
Name: iface.Name,
|
|
State: iface.State,
|
|
IPv4: iface.IPv4,
|
|
Outcome: outcome,
|
|
})
|
|
}
|
|
switch {
|
|
case hasIPv4 && !missingIPv4:
|
|
health.NetworkStatus = "OK"
|
|
case hasIPv4:
|
|
health.NetworkStatus = "PARTIAL"
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "dhcp_partial",
|
|
Severity: "warning",
|
|
Description: "At least one interface did not obtain IPv4 connectivity.",
|
|
})
|
|
default:
|
|
health.NetworkStatus = "FAILED"
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "dhcp_failed",
|
|
Severity: "warning",
|
|
Description: "No physical interface obtained IPv4 connectivity.",
|
|
})
|
|
}
|
|
}
|
|
|
|
for _, tool := range s.CheckTools(runtimeRequiredTools) {
|
|
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
|
|
Name: tool.Name,
|
|
Path: tool.Path,
|
|
OK: tool.OK,
|
|
})
|
|
if !tool.OK {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "tool_missing",
|
|
Severity: "warning",
|
|
Description: "Required tool missing: " + tool.Name,
|
|
})
|
|
}
|
|
}
|
|
|
|
for _, name := range runtimeTrackedServices {
|
|
health.Services = append(health.Services, schema.RuntimeServiceStatus{
|
|
Name: name,
|
|
Status: s.ServiceState(name),
|
|
})
|
|
}
|
|
|
|
lsmodText := commandText("lsmod")
|
|
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
|
if !health.DriverReady {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "nvidia_kernel_module_missing",
|
|
Severity: "warning",
|
|
Description: "NVIDIA kernel module is not loaded.",
|
|
})
|
|
}
|
|
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "nvidia_modeset_failed",
|
|
Severity: "warning",
|
|
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
|
})
|
|
}
|
|
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
|
health.DriverReady = true
|
|
}
|
|
|
|
health.CUDAReady = false
|
|
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
|
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
|
if err == nil {
|
|
health.CUDAReady = true
|
|
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "cuda_runtime_not_ready",
|
|
Severity: "warning",
|
|
Description: "CUDA runtime is not ready for GPU SAT.",
|
|
})
|
|
}
|
|
}
|
|
|
|
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
|
health.Status = "PARTIAL"
|
|
}
|
|
return health, nil
|
|
}
|
|
|
|
// commandText runs name with args and returns whatever the process wrote to
// stdout and stderr, combined, as a string. Output is returned even when the
// command fails; the result is empty only when nothing was captured (for
// example when the executable cannot be started). The error itself is never
// surfaced to the caller.
func commandText(name string, args ...string) string {
	output, runErr := exec.Command(name, args...).CombinedOutput()
	if runErr == nil || len(output) > 0 {
		return string(output)
	}
	return ""
}
|