Refine burn UI and NVIDIA stress flows

This commit is contained in:
2026-04-05 13:43:43 +03:00
parent 25af2df23a
commit 38e79143eb
18 changed files with 825 additions and 229 deletions

View File

@@ -302,6 +302,12 @@ memtest_fail() {
return 0
}
# Abort with an NVIDIA runtime validation error.
#
# Arguments: $1 - failure reason
# Outputs:   "ERROR: <reason>" on stderr
# Exits:     always terminates the current shell with status 1
nvidia_runtime_fail() {
    echo "ERROR: $1" 1>&2
    exit 1
}
iso_memtest_present() {
iso_path="$1"
iso_files="$(mktemp)"
@@ -439,6 +445,44 @@ validate_iso_memtest() {
echo "=== memtest validation OK ==="
}
# Verify that an NVIDIA build's ISO ships the DCGM runtime tools.
#
# Returns 0 immediately unless BEE_GPU_VENDOR is "nvidia". Otherwise pulls
# live/filesystem.squashfs out of the ISO, lists it with `unsquashfs -ll`, and
# requires usr/bin/dcgmi, usr/bin/nv-hostengine and usr/bin/dcgmproftester
# (optionally followed by digits) to appear in that listing.
#
# Arguments: $1 - path to the ISO image to inspect
# Globals:   BEE_GPU_VENDOR (read); iso_path, squashfs_tmp, squashfs_list,
#            nvidia_runtime_err (written as plain shell variables)
# Outputs:   progress banners on stdout
# Exits:     through nvidia_runtime_fail on any failed check, after the
#            temporary files have been removed
validate_iso_nvidia_runtime() {
    iso_path="$1"
    [ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0

    echo "=== validating NVIDIA runtime in ISO ==="

    [ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
    require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
    command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"

    # Check mktemp explicitly: with an empty path the later extraction or
    # redirection would fail and be reported as a misleading ISO/squashfs error.
    squashfs_tmp="$(mktemp)" || nvidia_runtime_fail "failed to create temporary file for NVIDIA runtime validation"
    squashfs_list="$(mktemp)" || {
        rm -f "$squashfs_tmp"
        nvidia_runtime_fail "failed to create temporary file for NVIDIA runtime validation"
    }

    # Record the first failure instead of exiting right away, so the temporary
    # files are removed in exactly one place below.
    nvidia_runtime_err=""
    if ! iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp"; then
        nvidia_runtime_err="failed to extract live/filesystem.squashfs from ISO"
    elif ! unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null; then
        nvidia_runtime_err="failed to inspect filesystem.squashfs from ISO"
    elif ! grep -Eq 'usr/bin/dcgmi$' "$squashfs_list"; then
        nvidia_runtime_err="dcgmi missing from final NVIDIA ISO"
    elif ! grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list"; then
        nvidia_runtime_err="nv-hostengine missing from final NVIDIA ISO"
    elif ! grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list"; then
        nvidia_runtime_err="dcgmproftester missing from final NVIDIA ISO"
    fi

    rm -f "$squashfs_tmp" "$squashfs_list"
    [ -z "$nvidia_runtime_err" ] || nvidia_runtime_fail "$nvidia_runtime_err"

    echo "=== NVIDIA runtime validation OK ==="
}
append_memtest_grub_entry() {
grub_cfg="$1"
[ -f "$grub_cfg" ] || return 1
@@ -1144,6 +1188,7 @@ if [ -f "$ISO_RAW" ]; then
fi
fi
# Validate the raw ISO before publishing it. validate_iso_nvidia_runtime
# returns 0 for non-NVIDIA builds and otherwise terminates the script
# (exit 1 via nvidia_runtime_fail) when a check fails, so the copy below is
# not reached for an NVIDIA ISO that lacks the required DCGM tools.
validate_iso_memtest "$ISO_RAW"
validate_iso_nvidia_runtime "$ISO_RAW"
# Copy the validated image to its final output path.
cp "$ISO_RAW" "$ISO_OUT"
echo ""
echo "=== done (${BEE_GPU_VENDOR}) ==="

View File

@@ -1,6 +1,10 @@
# NVIDIA DCGM (Data Center GPU Manager).
# Validate uses dcgmi diagnostics (dcgmi diag) for acceptance testing; Burn
# uses dcgmproftester as the official NVIDIA max-compute recipe. The
# smoketest/runtime contract treats dcgmproftester as required in the LiveCD.
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
# CUDA 13 userspace, so install the CUDA 13 build plus the proprietary
# diagnostic components explicitly.
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%

View File

@@ -52,6 +52,31 @@ else
fail "nvidia-smi: NOT FOUND"
fi
# Required DCGM binaries: report each one as ok/fail depending on whether it
# resolves on PATH (with /usr/local/bin searched first).
for tool in dcgmi nv-hostengine; do
    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
        ok "$tool found: $p"
    else
        fail "$tool: NOT FOUND"
    fi
done
# Resolve the first dcgmproftester variant available on PATH (unsuffixed name
# first, then the 13/12/11 suffixed names) into DCGM_PROFTESTER, and report
# the result. DCGM_PROFTESTER stays empty when none is found.
DCGM_PROFTESTER=""
for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
    p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null) || continue
    DCGM_PROFTESTER="$p"
    break
done
case "$DCGM_PROFTESTER" in
    "") fail "dcgmproftester: NOT FOUND" ;;
    *)  ok "dcgmproftester found: $DCGM_PROFTESTER" ;;
esac
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
ok "$tool found: $p"
@@ -60,6 +85,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
fi
done
# nvbandwidth: its absence is reported through warn rather than fail.
if ! p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
    warn "nvbandwidth: NOT FOUND"
else
    ok "nvbandwidth found: $p"
fi
# Blank separator line, then the heading of the "NVIDIA modules" report section.
echo ""
echo "-- NVIDIA modules --"
# NOTE(review): presumably the directory holding the NVIDIA kernel modules
# (.ko files) — confirm against the code that reads KO_DIR.
KO_DIR="/usr/local/lib/nvidia"