Fix Runtime Health criteria: network, services, nvidia-fabricmanager

Network: green if at least one interface has IPv4 (drop PARTIAL state).

Bee Services: treat inactive as OK — oneshot services (bee-sshsetup,
bee-preflight, bee-network, bee-audit, etc.) complete successfully and
exit to inactive; only failed is a real problem.

nvidia-fabricmanager: add ExecCondition=bee-check-nvswitch drop-in so
the service is silently skipped (inactive, not failed) on systems
without NVSwitch hardware (e.g. H200 NVL with direct NVLink, no
NVSwitch chips). bee-check-nvswitch detects NVSwitch via lspci
(vendor 10de, class 0680).

bee-nvidia.service: add ConditionPathExists=/usr/local/bin/bee-nvidia-load
so the unit is a no-op if somehow present in a non-nvidia build.

bee-boot-status: read /etc/bee-gpu-vendor and exclude bee-nvidia from
CRITICAL/ALL on non-nvidia builds, preventing boot hang if the unit
is unexpectedly present.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-14 05:20:25 +03:00
parent dc07580adc
commit 4f6579e040
7 changed files with 27 additions and 21 deletions

View File

@@ -55,7 +55,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
if err == nil {
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
hasIPv4 := false
missingIPv4 := false
for _, iface := range interfaces {
outcome := "no_offer"
if len(iface.IPv4) > 0 {
@@ -63,8 +62,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
hasIPv4 = true
} else if strings.EqualFold(iface.State, "DOWN") {
outcome = "link_down"
} else {
missingIPv4 = true
}
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
Name: iface.Name,
@@ -73,17 +70,9 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
Outcome: outcome,
})
}
switch {
case hasIPv4 && !missingIPv4:
if hasIPv4 {
health.NetworkStatus = "OK"
case hasIPv4:
health.NetworkStatus = "PARTIAL"
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "dhcp_partial",
Severity: "warning",
Description: "At least one interface did not obtain IPv4 connectivity.",
})
default:
} else {
health.NetworkStatus = "FAILED"
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "dhcp_failed",

View File

@@ -647,7 +647,7 @@ func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow {
if status == "" {
status = "UNKNOWN"
}
issue := runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed")
issue := runtimeIssueDescriptions(health.Issues, "dhcp_failed")
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
}
@@ -705,12 +705,12 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
nonActive := make([]string, 0)
for _, svc := range health.Services {
state := strings.TrimSpace(strings.ToLower(svc.Status))
// "activating" and "deactivating" are transient states for oneshot services
// (RemainAfterExit=yes) — the service is running normally, not failed.
// Only "failed" and "inactive" (after services should be running) are problems.
// "inactive" is OK for oneshot services that have completed successfully
// (bee-sshsetup, bee-preflight, bee-audit, bee-network, etc.).
// Only "failed" is a genuine problem.
switch state {
case "active", "activating", "deactivating", "reloading":
// OK — service is running or transitioning normally
case "active", "activating", "deactivating", "reloading", "inactive":
// OK — service is running, transitioning normally, or completed successfully
default:
nonActive = append(nonActive, svc.Name+"="+svc.Status)
}