Add fabric manager boot and support diagnostics

This commit is contained in:
Mikhail Chusavitin
2026-04-15 16:14:26 +03:00
parent ab3ad77cd6
commit 7237e4d3e4
7 changed files with 65 additions and 0 deletions

View File

@@ -22,6 +22,8 @@ var supportBundleServices = []string{
"bee-selfheal.service",
"bee-selfheal.timer",
"bee-sshsetup.service",
"nvidia-dcgm.service",
"nvidia-fabricmanager.service",
}
var supportBundleCommands = []struct {
@@ -48,6 +50,43 @@ else
fi
`}},
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
// Captures the output of `nvidia-smi topo -m` (stderr merged into stdout).
// When nvidia-smi is not on PATH the script writes a placeholder line instead;
// the trailing `|| true` keeps a failing topo query from producing a non-zero
// exit status for the script.
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
if command -v nvidia-smi >/dev/null 2>&1; then
nvidia-smi topo -m 2>&1 || true
else
echo "nvidia-smi not found"
fi
`}},
// Reports systemd state for units whose names match 'nvidia*' or 'fabric*' in
// three sections: installed unit files, loaded units (including inactive ones
// via --all), and failed units filtered case-insensitively for nvidia/fabric.
// Exits 0 with a placeholder line when systemctl is not on PATH.
// Note: the "no failed nvidia/fabric units" line is printed whenever grep
// matches nothing, which includes the case where `systemctl --failed` itself
// produced no usable output.
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
if ! command -v systemctl >/dev/null 2>&1; then
echo "systemctl not found"
exit 0
fi
echo "=== unit files ==="
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
echo
echo "=== active units ==="
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
echo
echo "=== failed units ==="
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
`}},
// Lists whichever of the known fabric manager / NVLink subnet manager binary
// paths exist under /usr/bin, one `ls -l` section per existing path.
// The "no fabric manager binaries found" line is written only when none of
// the candidates exist. A `found` flag is set inside the loop for that
// purpose: running `ls` over all candidates at once would exit non-zero as
// soon as any single path is missing, and so would report "not found" even
// after listing binaries that are present.
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
found=0
for candidate in \
/usr/bin/nvidia-fabricmanager \
/usr/bin/nv-fabricmanager \
/usr/bin/nvidia-fabricmanagerd \
/usr/bin/nvlsm; do
if [ -e "$candidate" ]; then
found=1
echo "=== $candidate ==="
ls -l "$candidate" 2>&1 || true
echo
fi
done
if [ "$found" -eq 0 ]; then
echo "no fabric manager binaries found"
fi
`}},
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
if ! command -v lspci >/dev/null 2>&1; then
echo "lspci not found"
@@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
}{
{name: "system/kern.log", src: "/var/log/kern.log"},
{name: "system/syslog.txt", src: "/var/log/syslog"},
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
}
// supportBundleGlob is a shell-style glob pattern: a date-shaped prefix of
// four, two and two arbitrary characters separated by dashes, a space, the
// literal "(BEE-SP" followed by any characters up to a ")", then any
// characters and the ".tar.gz" suffix.
// NOTE(review): presumably used to match support bundle archive file names;
// the call site is not visible here — confirm.
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"

View File

@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
"bee-audit",
"bee-web",
"bee-sshsetup",
"nvidia-dcgm",
"nvidia-fabricmanager",
}
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {