Fix Runtime Health criteria: network, services, nvidia-fabricmanager
Network: green if at least one interface has IPv4 (drop PARTIAL state). Bee Services: treat inactive as OK — oneshot services (bee-sshsetup, bee-preflight, bee-network, bee-audit, etc.) complete successfully and exit to inactive; only failed is a real problem. nvidia-fabricmanager: add ExecCondition=bee-check-nvswitch drop-in so the service is silently skipped (inactive, not failed) on systems without NVSwitch hardware (e.g. H200 NVL with direct NVLink, no NVSwitch chips). bee-check-nvswitch detects NVSwitch via lspci (vendor 10de, class 0680). bee-nvidia.service: add ConditionPathExists=/usr/local/bin/bee-nvidia-load so the unit is a no-op if somehow present in a non-nvidia build. bee-boot-status: read /etc/bee-gpu-vendor and exclude bee-nvidia from CRITICAL/ALL on non-nvidia builds, preventing boot hang if the unit is unexpectedly present. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -55,7 +55,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
||||||
hasIPv4 := false
|
hasIPv4 := false
|
||||||
missingIPv4 := false
|
|
||||||
for _, iface := range interfaces {
|
for _, iface := range interfaces {
|
||||||
outcome := "no_offer"
|
outcome := "no_offer"
|
||||||
if len(iface.IPv4) > 0 {
|
if len(iface.IPv4) > 0 {
|
||||||
@@ -63,8 +62,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
hasIPv4 = true
|
hasIPv4 = true
|
||||||
} else if strings.EqualFold(iface.State, "DOWN") {
|
} else if strings.EqualFold(iface.State, "DOWN") {
|
||||||
outcome = "link_down"
|
outcome = "link_down"
|
||||||
} else {
|
|
||||||
missingIPv4 = true
|
|
||||||
}
|
}
|
||||||
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
||||||
Name: iface.Name,
|
Name: iface.Name,
|
||||||
@@ -73,17 +70,9 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
Outcome: outcome,
|
Outcome: outcome,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
switch {
|
if hasIPv4 {
|
||||||
case hasIPv4 && !missingIPv4:
|
|
||||||
health.NetworkStatus = "OK"
|
health.NetworkStatus = "OK"
|
||||||
case hasIPv4:
|
} else {
|
||||||
health.NetworkStatus = "PARTIAL"
|
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
||||||
Code: "dhcp_partial",
|
|
||||||
Severity: "warning",
|
|
||||||
Description: "At least one interface did not obtain IPv4 connectivity.",
|
|
||||||
})
|
|
||||||
default:
|
|
||||||
health.NetworkStatus = "FAILED"
|
health.NetworkStatus = "FAILED"
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
Code: "dhcp_failed",
|
Code: "dhcp_failed",
|
||||||
|
|||||||
@@ -647,7 +647,7 @@ func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow {
|
|||||||
if status == "" {
|
if status == "" {
|
||||||
status = "UNKNOWN"
|
status = "UNKNOWN"
|
||||||
}
|
}
|
||||||
issue := runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed")
|
issue := runtimeIssueDescriptions(health.Issues, "dhcp_failed")
|
||||||
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
|
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -705,12 +705,12 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
|
|||||||
nonActive := make([]string, 0)
|
nonActive := make([]string, 0)
|
||||||
for _, svc := range health.Services {
|
for _, svc := range health.Services {
|
||||||
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
||||||
// "activating" and "deactivating" are transient states for oneshot services
|
// "inactive" is OK for oneshot services that have completed successfully
|
||||||
// (RemainAfterExit=yes) — the service is running normally, not failed.
|
// (bee-sshsetup, bee-preflight, bee-audit, bee-network, etc.).
|
||||||
// Only "failed" and "inactive" (after services should be running) are problems.
|
// Every state not listed in the case below (in practice "failed") is reported as a problem.
|
||||||
switch state {
|
switch state {
|
||||||
case "active", "activating", "deactivating", "reloading":
|
case "active", "activating", "deactivating", "reloading", "inactive":
|
||||||
// OK — service is running or transitioning normally
|
// OK — service is running, transitioning normally, or completed successfully
|
||||||
default:
|
default:
|
||||||
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -69,6 +69,7 @@ chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gui-gate 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gui-gate 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-check-nvswitch 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
Description=Bee: load NVIDIA kernel modules and create device nodes
|
Description=Bee: load NVIDIA kernel modules and create device nodes
|
||||||
After=local-fs.target udev.service bee-blackbox.service
|
After=local-fs.target udev.service bee-blackbox.service
|
||||||
Before=bee-audit.service
|
Before=bee-audit.service
|
||||||
|
# Skip silently if bee-nvidia-load is absent (non-nvidia builds).
|
||||||
|
ConditionPathExists=/usr/local/bin/bee-nvidia-load
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
@@ -0,0 +1,4 @@
|
|||||||
|
[Service]
|
||||||
|
# Skip fabricmanager on systems without NVSwitch hardware.
|
||||||
|
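# If the ExecCondition command exits with status 1-254, the unit is skipped (inactive, not failed);
# exit 0 lets the unit start, while exit 255 or abnormal termination marks it failed.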
|
||||||
|
ExecCondition=/usr/local/bin/bee-check-nvswitch
|
||||||
@@ -3,8 +3,14 @@
|
|||||||
# Shows live service status until all bee services are done or failed,
|
# Shows live service status until all bee services are done or failed,
|
||||||
# then exits so getty can show the login prompt.
|
# then exits so getty can show the login prompt.
|
||||||
|
|
||||||
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
GPU_VENDOR="$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)"
|
||||||
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
||||||
|
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
||||||
|
else
|
||||||
|
CRITICAL="bee-preflight bee-audit"
|
||||||
|
ALL="bee-sshsetup ssh bee-network bee-preflight bee-audit bee-web"
|
||||||
|
fi
|
||||||
|
|
||||||
svc_state() { systemctl is-active "$1.service" 2>/dev/null || echo "inactive"; }
|
svc_state() { systemctl is-active "$1.service" 2>/dev/null || echo "inactive"; }
|
||||||
|
|
||||||
|
|||||||
4
iso/overlay/usr/local/bin/bee-check-nvswitch
Normal file
4
iso/overlay/usr/local/bin/bee-check-nvswitch
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# Exit 0 if NVSwitch hardware is detected; exit 1 otherwise (also when lspci is missing or prints nothing), so fabricmanager is skipped on non-NVSwitch systems.
|
||||||
|
# NVSwitch appears in lspci as vendor 10de, class 0680 (Bridge, Other).
|
||||||
|
lspci -Dn 2>/dev/null | awk '$2 == "0680:" && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
Reference in New Issue
Block a user