Refine burn UI and NVIDIA stress flows
This commit is contained in:
@@ -302,6 +302,12 @@ memtest_fail() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# Abort the build with an NVIDIA runtime validation error.
# Arguments: $1 - human-readable failure description.
# Outputs:   writes "ERROR: <message>" to stderr.
# Returns:   never returns; terminates the shell with exit status 1.
nvidia_runtime_fail() {
    msg="${1:-}"
    # printf instead of echo: the message may embed an ISO path, and echo's
    # handling of backslashes / option-like text differs between sh flavours.
    printf 'ERROR: %s\n' "$msg" >&2
    exit 1
}
|
||||
|
||||
iso_memtest_present() {
|
||||
iso_path="$1"
|
||||
iso_files="$(mktemp)"
|
||||
@@ -439,6 +445,44 @@ validate_iso_memtest() {
|
||||
echo "=== memtest validation OK ==="
|
||||
}
|
||||
|
||||
# Verify that the final ISO ships the NVIDIA DCGM runtime binaries.
#
# Extracts live/filesystem.squashfs from the ISO into a temporary file, lists
# its contents with `unsquashfs -ll`, and requires entries ending in
# usr/bin/dcgmi, usr/bin/nv-hostengine and usr/bin/dcgmproftester (optionally
# followed by a numeric suffix).
#
# Globals:   BEE_GPU_VENDOR (read) - validation is skipped unless it is "nvidia".
# Arguments: $1 - path to the ISO image.
# Outputs:   progress banners on stdout; errors via nvidia_runtime_fail.
# Returns:   0 when validation passes or is skipped; any failure goes through
#            nvidia_runtime_fail.
validate_iso_nvidia_runtime() {
    iso_path="$1"
    # ${…:-} keeps the vendor check safe when the variable is unset under set -u.
    [ "${BEE_GPU_VENDOR:-}" = "nvidia" ] || return 0

    echo "=== validating NVIDIA runtime in ISO ==="

    [ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
    require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
    command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"

    # Check mktemp explicitly: an empty path would otherwise be handed to the
    # extraction and listing steps below.
    squashfs_tmp="$(mktemp)" || nvidia_runtime_fail "failed to create temporary file for NVIDIA runtime validation"
    squashfs_list="$(mktemp)" || {
        rm -f "$squashfs_tmp"
        nvidia_runtime_fail "failed to create temporary file for NVIDIA runtime validation"
    }

    # Record the first failure instead of bailing out immediately, so the
    # temporary files are removed on exactly one path.
    nvidia_runtime_err=""
    if ! iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp"; then
        nvidia_runtime_err="failed to extract live/filesystem.squashfs from ISO"
    elif ! unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null; then
        nvidia_runtime_err="failed to inspect filesystem.squashfs from ISO"
    elif ! grep -Eq 'usr/bin/dcgmi$' "$squashfs_list"; then
        nvidia_runtime_err="dcgmi missing from final NVIDIA ISO"
    elif ! grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list"; then
        nvidia_runtime_err="nv-hostengine missing from final NVIDIA ISO"
    elif ! grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list"; then
        nvidia_runtime_err="dcgmproftester missing from final NVIDIA ISO"
    fi

    rm -f "$squashfs_tmp" "$squashfs_list"
    [ -z "$nvidia_runtime_err" ] || nvidia_runtime_fail "$nvidia_runtime_err"

    echo "=== NVIDIA runtime validation OK ==="
}
|
||||
|
||||
append_memtest_grub_entry() {
|
||||
grub_cfg="$1"
|
||||
[ -f "$grub_cfg" ] || return 1
|
||||
@@ -1144,6 +1188,7 @@ if [ -f "$ISO_RAW" ]; then
|
||||
fi
|
||||
fi
|
||||
validate_iso_memtest "$ISO_RAW"
|
||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
|
||||
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
|
||||
# NVIDIA DCGM (Data Center GPU Manager).
# Validate uses dcgmi diagnostics; Burn uses dcgmproftester, NVIDIA's official
# max-compute recipe. The smoketest/runtime contract treats dcgmproftester as
# required in the LiveCD.
# DCGM 4 is packaged per CUDA major version. The image ships NVIDIA driver 590
# with CUDA 13 userspace, so install the CUDA 13 build plus the proprietary
# components explicitly.
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||
|
||||
@@ -52,6 +52,31 @@ else
|
||||
fail "nvidia-smi: NOT FOUND"
|
||||
fi
|
||||
|
||||
# DCGM command-line tools: each must resolve, with /usr/local/bin searched
# ahead of the inherited PATH.
for tool in dcgmi nv-hostengine; do
    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
        ok "$tool found: $p"
    else
        fail "$tool: NOT FOUND"
    fi
done

# dcgmproftester may be installed unversioned or with a CUDA-major suffix;
# remember the first candidate that resolves.
DCGM_PROFTESTER=""
for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
    p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null) || continue
    DCGM_PROFTESTER="$p"
    break
done
if [ -z "$DCGM_PROFTESTER" ]; then
    fail "dcgmproftester: NOT FOUND"
else
    ok "dcgmproftester found: $DCGM_PROFTESTER"
fi
|
||||
|
||||
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
ok "$tool found: $p"
|
||||
@@ -60,6 +85,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
|
||||
fi
|
||||
done
|
||||
|
||||
# nvbandwidth is optional: its absence is reported as a warning, not a failure.
if ! p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
    warn "nvbandwidth: NOT FOUND"
else
    ok "nvbandwidth found: $p"
fi
|
||||
|
||||
echo ""
|
||||
echo "-- NVIDIA modules --"
|
||||
KO_DIR="/usr/local/lib/nvidia"
|
||||
|
||||
Reference in New Issue
Block a user