343 lines
9.7 KiB
Go
343 lines
9.7 KiB
Go
package platform
|
|
|
|
import (
|
|
"bufio"
|
|
"os"
|
|
"os/exec"
|
|
"strings"
|
|
"time"
|
|
|
|
"bee/audit/internal/schema"
|
|
)
|
|
|
|
// runtimeRequiredTools lists the executables whose availability is checked
// for every runtime health report, before any GPU-vendor-specific tools.
var runtimeRequiredTools = []string{
	"dmidecode", "lspci", "lsblk", "smartctl",
	"nvme", "ipmitool", "dhclient", "mount",
}
|
|
|
|
// runtimeTrackedServices names the services whose state is recorded in
// every runtime health report, in reporting order.
var runtimeTrackedServices = []string{
	"bee-network", "bee-nvidia", "bee-preflight",
	"bee-audit", "bee-web", "bee-sshsetup",
}
|
|
|
|
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
|
checkedAt := time.Now().UTC().Format(time.RFC3339)
|
|
health := schema.RuntimeHealth{
|
|
Status: "OK",
|
|
CheckedAt: checkedAt,
|
|
ExportDir: strings.TrimSpace(exportDir),
|
|
}
|
|
|
|
if health.ExportDir != "" {
|
|
if err := os.MkdirAll(health.ExportDir, 0755); err != nil {
|
|
health.Status = "FAILED"
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "export_dir_unavailable",
|
|
Severity: "critical",
|
|
Description: err.Error(),
|
|
})
|
|
}
|
|
}
|
|
|
|
interfaces, err := s.ListInterfaces()
|
|
if err == nil {
|
|
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
|
hasIPv4 := false
|
|
missingIPv4 := false
|
|
for _, iface := range interfaces {
|
|
outcome := "no_offer"
|
|
if len(iface.IPv4) > 0 {
|
|
outcome = "lease_acquired"
|
|
hasIPv4 = true
|
|
} else if strings.EqualFold(iface.State, "DOWN") {
|
|
outcome = "link_down"
|
|
} else {
|
|
missingIPv4 = true
|
|
}
|
|
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
|
Name: iface.Name,
|
|
State: iface.State,
|
|
IPv4: iface.IPv4,
|
|
Outcome: outcome,
|
|
})
|
|
}
|
|
switch {
|
|
case hasIPv4 && !missingIPv4:
|
|
health.NetworkStatus = "OK"
|
|
case hasIPv4:
|
|
health.NetworkStatus = "PARTIAL"
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "dhcp_partial",
|
|
Severity: "warning",
|
|
Description: "At least one interface did not obtain IPv4 connectivity.",
|
|
})
|
|
default:
|
|
health.NetworkStatus = "FAILED"
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "dhcp_failed",
|
|
Severity: "warning",
|
|
Description: "No physical interface obtained IPv4 connectivity.",
|
|
})
|
|
}
|
|
}
|
|
|
|
vendor := s.DetectGPUVendor()
|
|
for _, tool := range s.runtimeToolStatuses(vendor) {
|
|
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
|
|
Name: tool.Name,
|
|
Path: tool.Path,
|
|
OK: tool.OK,
|
|
})
|
|
if !tool.OK {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "tool_missing",
|
|
Severity: "warning",
|
|
Description: "Required tool missing: " + tool.Name,
|
|
})
|
|
}
|
|
}
|
|
|
|
for _, name := range runtimeTrackedServices {
|
|
health.Services = append(health.Services, schema.RuntimeServiceStatus{
|
|
Name: name,
|
|
Status: s.ServiceState(name),
|
|
})
|
|
}
|
|
|
|
s.collectGPURuntimeHealth(vendor, &health)
|
|
s.collectToRAMHealth(&health)
|
|
s.collectUSBExportHealth(&health)
|
|
|
|
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
|
health.Status = "PARTIAL"
|
|
}
|
|
return health, nil
|
|
}
|
|
|
|
// commandText runs name with args and returns its combined stdout and stderr
// as a string. Failures are never reported to the caller: a command that
// cannot be started, or that fails without printing anything, yields "",
// while a failing command that did print something yields that output.
func commandText(name string, args ...string) string {
	// The error is dropped on purpose: whatever was captured is returned
	// either way, and with nothing captured the result is already "".
	output, _ := exec.Command(name, args...).CombinedOutput()
	return string(output)
}
|
|
|
|
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|
tools := s.CheckTools(runtimeRequiredTools)
|
|
switch vendor {
|
|
case "nvidia":
|
|
tools = append(tools, s.CheckTools([]string{
|
|
"nvidia-smi",
|
|
"dcgmi",
|
|
"nv-hostengine",
|
|
"nvidia-bug-report.sh",
|
|
"bee-gpu-burn",
|
|
"bee-john-gpu-stress",
|
|
"bee-nccl-gpu-stress",
|
|
"all_reduce_perf",
|
|
})...)
|
|
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
|
|
case "amd":
|
|
tool := ToolStatus{Name: "rocm-smi"}
|
|
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
|
tool.Path = cmd[0]
|
|
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
|
|
tool.Path = cmd[1]
|
|
}
|
|
tool.OK = true
|
|
}
|
|
tools = append(tools, tool)
|
|
}
|
|
return tools
|
|
}
|
|
|
|
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
|
for _, candidate := range candidates {
|
|
path, err := exec.LookPath(candidate)
|
|
if err == nil {
|
|
return ToolStatus{Name: display, Path: path, OK: true}
|
|
}
|
|
}
|
|
return ToolStatus{Name: display}
|
|
}
|
|
|
|
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
|
|
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
|
|
// incomplete RAM copy exists but runtime still depends on the boot medium,
|
|
// "failed" = toram was requested but medium is not in RAM.
|
|
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
|
state := s.LiveMediaRAMState()
|
|
health.ToRAMStatus = state.Status
|
|
switch state.Status {
|
|
case "ok":
|
|
return
|
|
case "failed":
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "toram_copy_failed",
|
|
Severity: "warning",
|
|
Description: state.Message,
|
|
})
|
|
case "partial":
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "toram_copy_partial",
|
|
Severity: "warning",
|
|
Description: state.Message,
|
|
})
|
|
}
|
|
}
|
|
|
|
// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
|
|
// suitable for log export. Sets USBExportPath to the first match found.
|
|
func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
|
|
health.USBExportPath = findUSBExportMount()
|
|
}
|
|
|
|
// findUSBExportMount returns the mount point of the first writable USB filesystem
|
|
// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs) whose backing block device
|
|
// has USB transport. Returns "" if none found.
|
|
func findUSBExportMount() string {
|
|
f, err := os.Open("/proc/mounts")
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
defer f.Close()
|
|
|
|
// fs types that are expected on USB export drives
|
|
exportFSTypes := map[string]bool{
|
|
"vfat": true,
|
|
"exfat": true,
|
|
"ext2": true,
|
|
"ext3": true,
|
|
"ext4": true,
|
|
"ntfs": true,
|
|
"ntfs3": true,
|
|
"fuseblk": true,
|
|
}
|
|
|
|
scanner := bufio.NewScanner(f)
|
|
for scanner.Scan() {
|
|
// fields: device mountpoint fstype options dump pass
|
|
fields := strings.Fields(scanner.Text())
|
|
if len(fields) < 4 {
|
|
continue
|
|
}
|
|
device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
|
if !exportFSTypes[strings.ToLower(fsType)] {
|
|
continue
|
|
}
|
|
// Skip read-only mounts
|
|
opts := strings.Split(options, ",")
|
|
readOnly := false
|
|
for _, o := range opts {
|
|
if strings.TrimSpace(o) == "ro" {
|
|
readOnly = true
|
|
break
|
|
}
|
|
}
|
|
if readOnly {
|
|
continue
|
|
}
|
|
// Check USB transport via lsblk on the device (or its parent disk for partitions).
|
|
if !strings.HasPrefix(device, "/dev/") {
|
|
continue
|
|
}
|
|
checkDev := device
|
|
// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
|
|
// Strip trailing partition digits to get the parent disk name.
|
|
if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
|
|
checkDev = trimmed
|
|
}
|
|
if blockDeviceTransport(checkDev) == "usb" {
|
|
return mountPoint
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
|
lsmodText := commandText("lsmod")
|
|
|
|
switch vendor {
|
|
case "nvidia":
|
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
|
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
|
if health.NvidiaGSPMode == "gsp-stuck" {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "nvidia_gsp_stuck",
|
|
Severity: "critical",
|
|
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
|
})
|
|
} else if health.NvidiaGSPMode == "gsp-off" {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "nvidia_gsp_disabled",
|
|
Severity: "warning",
|
|
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
|
})
|
|
}
|
|
}
|
|
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
|
if !health.DriverReady {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "nvidia_kernel_module_missing",
|
|
Severity: "warning",
|
|
Description: "NVIDIA kernel module is not loaded.",
|
|
})
|
|
}
|
|
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "nvidia_modeset_failed",
|
|
Severity: "warning",
|
|
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
|
})
|
|
}
|
|
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
|
health.DriverReady = true
|
|
}
|
|
|
|
if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
|
|
out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
|
if err == nil {
|
|
health.CUDAReady = true
|
|
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "cuda_runtime_not_ready",
|
|
Severity: "warning",
|
|
Description: "CUDA runtime is not ready for GPU SAT.",
|
|
})
|
|
}
|
|
}
|
|
case "amd":
|
|
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
|
|
if !health.DriverReady {
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "amdgpu_kernel_module_missing",
|
|
Severity: "warning",
|
|
Description: "AMD GPU driver is not loaded.",
|
|
})
|
|
}
|
|
|
|
out, err := runROCmSMI("--showproductname", "--csv")
|
|
if err == nil && strings.TrimSpace(string(out)) != "" {
|
|
health.CUDAReady = true
|
|
health.DriverReady = true
|
|
return
|
|
}
|
|
|
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
Code: "rocm_smi_unavailable",
|
|
Severity: "warning",
|
|
Description: "ROCm SMI is not available for AMD GPU SAT.",
|
|
})
|
|
}
|
|
}
|