Files
bee/audit/internal/platform/runtime.go
2026-03-16 18:20:26 +03:00

165 lines
4.3 KiB
Go

package platform
import (
"os"
"os/exec"
"strings"
"time"
"bee/audit/internal/schema"
)
var runtimeRequiredTools = []string{
"dmidecode",
"lspci",
"lsblk",
"smartctl",
"nvme",
"ipmitool",
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
"dhclient",
"mount",
}
var runtimeTrackedServices = []string{
"bee-network",
"bee-nvidia",
"bee-preflight",
"bee-audit",
"bee-web",
"bee-sshsetup",
}
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
checkedAt := time.Now().UTC().Format(time.RFC3339)
health := schema.RuntimeHealth{
Status: "OK",
CheckedAt: checkedAt,
ExportDir: strings.TrimSpace(exportDir),
}
if health.ExportDir != "" {
if err := os.MkdirAll(health.ExportDir, 0755); err != nil {
health.Status = "FAILED"
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "export_dir_unavailable",
Severity: "critical",
Description: err.Error(),
})
}
}
interfaces, err := s.ListInterfaces()
if err == nil {
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
hasIPv4 := false
missingIPv4 := false
for _, iface := range interfaces {
outcome := "no_offer"
if len(iface.IPv4) > 0 {
outcome = "lease_acquired"
hasIPv4 = true
} else if strings.EqualFold(iface.State, "DOWN") {
outcome = "link_down"
} else {
missingIPv4 = true
}
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
Name: iface.Name,
State: iface.State,
IPv4: iface.IPv4,
Outcome: outcome,
})
}
switch {
case hasIPv4 && !missingIPv4:
health.NetworkStatus = "OK"
case hasIPv4:
health.NetworkStatus = "PARTIAL"
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "dhcp_partial",
Severity: "warning",
Description: "At least one interface did not obtain IPv4 connectivity.",
})
default:
health.NetworkStatus = "FAILED"
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "dhcp_failed",
Severity: "warning",
Description: "No physical interface obtained IPv4 connectivity.",
})
}
}
for _, tool := range s.CheckTools(runtimeRequiredTools) {
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
Name: tool.Name,
Path: tool.Path,
OK: tool.OK,
})
if !tool.OK {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "tool_missing",
Severity: "warning",
Description: "Required tool missing: " + tool.Name,
})
}
}
for _, name := range runtimeTrackedServices {
health.Services = append(health.Services, schema.RuntimeServiceStatus{
Name: name,
Status: s.ServiceState(name),
})
}
lsmodText := commandText("lsmod")
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
health.CUDAReady = false
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
if health.Status != "FAILED" && len(health.Issues) > 0 {
health.Status = "PARTIAL"
}
return health, nil
}
func commandText(name string, args ...string) string {
raw, err := exec.Command(name, args...).CombinedOutput()
if err != nil && len(raw) == 0 {
return ""
}
return string(raw)
}