From 0b8a2ff83feea00b98eae34ca0b87adcd81ace00 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Thu, 30 Apr 2026 10:47:08 +0300 Subject: [PATCH] Add validate test matrix and GPU test methodology docs Co-Authored-By: Claude Sonnet 4.6 --- bible-local/README.md | 57 +++++++++++++++++++ .../docs/customer-gpu-test-methodology.md | 54 ++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 bible-local/docs/customer-gpu-test-methodology.md diff --git a/bible-local/README.md b/bible-local/README.md index c75a83b..b939942 100644 --- a/bible-local/README.md +++ b/bible-local/README.md @@ -9,5 +9,62 @@ Generic engineering rules live in `bible/rules/patterns/`. |---|---| | `architecture/system-overview.md` | What bee does, scope, tech stack | | `architecture/runtime-flows.md` | Boot sequence, audit flow, service order | +| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list | | `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract | +| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy | | `decisions/` | Architectural decision log, including read-only submodule policy | + +## Validate Test Matrix + +### Validate + +- CPU check + - `lscpu` + - `sensors` + - `stress-ng` +- Memory check + - `free` + - `timeout memtester` + - `free` +- NVMe storage check + - `nvme id-ctrl` + - `nvme smart-log` + - `nvme device-self-test` +- SATA/SAS storage check + - `smartctl -H -A` + - `smartctl -t short` +- Basic NVIDIA GPU check + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dmidecode -t baseboard` + - `dmidecode -t system` + - `dcgmi diag -r 2` +- Inter-GPU communication check + - `all_reduce_perf` +- GPU bandwidth check + - `dcgmi diag -r nvbandwidth` + +### Validate -> Stress + +- Extended NVIDIA GPU check + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dmidecode -t baseboard` + - `dmidecode -t system` + - `dcgmi diag -r 3` +- NVIDIA targeted stress + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dcgmi diag -r targeted_stress` +- NVIDIA targeted power + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dcgmi diag -r targeted_power` +- NVIDIA pulse test + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dcgmi diag -r pulse_test` +- Inter-GPU communication check + - `all_reduce_perf` +- GPU bandwidth check + - `dcgmi diag -r nvbandwidth` diff --git a/bible-local/docs/customer-gpu-test-methodology.md b/bible-local/docs/customer-gpu-test-methodology.md new file mode 100644 index 0000000..0c1f160 --- /dev/null +++ b/bible-local/docs/customer-gpu-test-methodology.md @@ -0,0 +1,54 @@ +# GPU PCIe Test Methodology + +## Validate + +- CPU check + - `lscpu` + - `sensors` + - `stress-ng` +- Memory check + - `free` + - `timeout memtester` + - `free` +- NVMe storage check + - `nvme id-ctrl` + - `nvme smart-log` + - `nvme device-self-test` +- SATA/SAS storage check + - `smartctl -H -A` + - `smartctl -t short` +- Basic NVIDIA GPU check + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dmidecode -t baseboard` + - `dmidecode -t system` + - `dcgmi diag -r 2` +- Inter-GPU communication check + - `all_reduce_perf` +- GPU bandwidth check + - `dcgmi diag -r nvbandwidth` + +## Validate -> Stress + +- Extended NVIDIA GPU check + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dmidecode -t baseboard` + - `dmidecode -t system` + - `dcgmi diag -r 3` +- NVIDIA targeted stress + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dcgmi diag -r targeted_stress` +- NVIDIA targeted power + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dcgmi diag -r targeted_power` +- NVIDIA pulse test + - `nvidia-smi -pm 1` + - `nvidia-smi -q` + - `dcgmi diag -r pulse_test` +- Inter-GPU communication check + - `all_reduce_perf` +- GPU bandwidth check + - `dcgmi diag -r nvbandwidth`