Compare commits
117 Commits
d952e10dbb
...
v2.6
| Author | SHA1 | Date | |
|---|---|---|---|
| b4371e291e | |||
| c22b53a406 | |||
| ff0acc3698 | |||
| d50760e7c6 | |||
| ed4f8be019 | |||
| 883592d029 | |||
| a6dcaf1c7e | |||
| 88727fb590 | |||
| c9f5224c42 | |||
| 7cb5c02a9b | |||
| c1aa3cf491 | |||
| f7eb75c57c | |||
| 004cc4910d | |||
| ed1cceed8c | |||
| 9fe9f061f8 | |||
| 837a1fb981 | |||
| 1f43b4e050 | |||
| 83bbc8a1bc | |||
| 896bdb6ee8 | |||
| 5407c26e25 | |||
| 4fddaba9c5 | |||
| d2f384b6eb | |||
| 25f0f30aaf | |||
| a57b037a91 | |||
| 5644231f9a | |||
| eea98e6d76 | |||
| 967455194c | |||
| 79dabf3efb | |||
| 1336f5b95c | |||
| 31486a31c1 | |||
| aa3fc332ba | |||
| 62c57b87f2 | |||
| f600261546 | |||
| d7ca04bdfb | |||
| 5433652c70 | |||
| b25f014dbd | |||
| d69a46f211 | |||
|
|
fc5c2019aa | ||
|
|
67a215c66f | ||
|
|
8b4bfdf5ad | ||
|
|
0a52a4f3ba | ||
|
|
b132f7973a | ||
|
|
bd94b6c792 | ||
|
|
06017eddfd | ||
|
|
0ac7b6a963 | ||
|
|
3d2ae4cdcb | ||
|
|
4669f14f4f | ||
|
|
540a9e39b8 | ||
|
|
58510207fa | ||
|
|
4cd7c9ab4e | ||
|
|
cfe255f6e4 | ||
|
|
8b9d3447d7 | ||
|
|
614b7cad61 | ||
|
|
9a1df9b1ba | ||
|
|
30cf014d58 | ||
|
|
27d478aed6 | ||
|
|
d36e8442a9 | ||
|
|
b345b0d14d | ||
|
|
0a1ac2ab9f | ||
|
|
1e62f828c6 | ||
|
|
f8c997d272 | ||
|
|
0c16616cc9 | ||
|
|
adcc147b32 | ||
|
|
94e233651e | ||
|
|
03c36f6cb2 | ||
|
|
a221814797 | ||
|
|
b6619d5ccc | ||
|
|
450193b063 | ||
|
|
ee8931f171 | ||
|
|
b771d95894 | ||
|
|
8e60e474dc | ||
|
|
2f4ec2acda | ||
|
|
7ed5cb0306 | ||
|
|
6df7ac68f5 | ||
|
|
0ce23aea4f | ||
|
|
36dff6e584 | ||
|
|
1c80906c1f | ||
|
|
2abe2ce3aa | ||
|
|
8233c9ee85 | ||
|
|
13189e2683 | ||
|
|
76a17937f3 | ||
|
|
b965184e71 | ||
|
|
b25a2f6d30 | ||
|
|
d18cde19c1 | ||
|
|
78c6dfc0ef | ||
|
|
72cf482ad3 | ||
|
|
a6023372b1 | ||
|
|
ab5a4be7ac | ||
|
|
b8c235b5ac | ||
|
|
b483e2ce35 | ||
|
|
17f0bda45e | ||
|
|
591164a251 | ||
|
|
ef4ec5695d | ||
|
|
f1e096cabe | ||
|
|
6082c7953e | ||
|
|
f37ef0d844 | ||
|
|
e32fa6e477 | ||
|
|
20118bb400 | ||
|
|
55d6876297 | ||
|
|
e8e176ab7f | ||
|
|
caeafa836b | ||
|
|
e8a52562e7 | ||
|
|
6aca1682b9 | ||
|
|
b7c888edb1 | ||
|
|
17d5d74a8d | ||
|
|
d487e539bb | ||
|
|
441ab3adbd | ||
|
|
c91c8d8cf9 | ||
|
|
83e1910281 | ||
|
|
2252c5af56 | ||
|
|
7a4d75c143 | ||
|
|
7c62d100d4 | ||
|
|
c843ff95a2 | ||
|
|
0057686769 | ||
|
|
68b5e02a74 | ||
|
|
fa553c3f20 | ||
|
|
345a93512a |
@@ -1 +1,2 @@
|
|||||||
BUILDER_HOST=
|
BUILDER_HOST=
|
||||||
|
BUILDER_USER=
|
||||||
|
|||||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -1,3 +1,6 @@
|
|||||||
[submodule "bible"]
|
[submodule "bible"]
|
||||||
path = bible
|
path = bible
|
||||||
url = https://git.mchus.pro/mchus/bible.git
|
url = https://git.mchus.pro/mchus/bible.git
|
||||||
|
[submodule "internal/chart"]
|
||||||
|
path = internal/chart
|
||||||
|
url = https://git.mchus.pro/reanimator/chart.git
|
||||||
|
|||||||
395
PLAN.md
395
PLAN.md
@@ -4,13 +4,13 @@ Hardware audit LiveCD for offline server inventory.
|
|||||||
Produces `HardwareIngestRequest` JSON compatible with core/reanimator.
|
Produces `HardwareIngestRequest` JSON compatible with core/reanimator.
|
||||||
|
|
||||||
**Principle:** OS-level collection — reads hardware directly, not through BMC.
|
**Principle:** OS-level collection — reads hardware directly, not through BMC.
|
||||||
Fully unattended — no user interaction required at any stage. Boot → update → audit → output → done.
|
Automatic boot audit plus operator console. Boot runs audit immediately, but local/SSH operators can rerun checks through the TUI and CLI.
|
||||||
All errors are logged, never presented interactively. Every failure path has a silent fallback.
|
Errors are logged and should not block boot on partial collector failures.
|
||||||
Fills the gaps where logpile/Redfish is blind: NVMe, DIMM serials, GPU serials, physical disks behind RAID, full SMART, NIC firmware.
|
Fills the gaps where logpile/Redfish is blind: NVMe, DIMM serials, GPU serials, physical disks behind RAID, full SMART, NIC firmware.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Status snapshot (2026-03-06)
|
## Status snapshot (2026-03-14)
|
||||||
|
|
||||||
### Phase 1 — Go Audit Binary
|
### Phase 1 — Go Audit Binary
|
||||||
|
|
||||||
@@ -23,33 +23,38 @@ Fills the gaps where logpile/Redfish is blind: NVMe, DIMM serials, GPU serials,
|
|||||||
- 1.7 PSU collector — **DONE (basic FRU path)**
|
- 1.7 PSU collector — **DONE (basic FRU path)**
|
||||||
- 1.8 NVIDIA GPU enrichment — **DONE**
|
- 1.8 NVIDIA GPU enrichment — **DONE**
|
||||||
- 1.8b Component wear / age telemetry — **DONE** (storage + NVMe + NVIDIA + NIC SFP/DOM + NIC packet stats)
|
- 1.8b Component wear / age telemetry — **DONE** (storage + NVMe + NVIDIA + NIC SFP/DOM + NIC packet stats)
|
||||||
|
- 1.8c Storage health verdicts — **DONE** (SMART/NVMe warning/failed status derivation)
|
||||||
- 1.9 Mellanox/NVIDIA NIC enrichment — **DONE** (mstflint + ethtool firmware fallback)
|
- 1.9 Mellanox/NVIDIA NIC enrichment — **DONE** (mstflint + ethtool firmware fallback)
|
||||||
- 1.10 RAID controller enrichment — **DONE (initial multi-tool support)** (storcli + sas2/3ircu + arcconf + ssacli + VROC/mdstat)
|
- 1.10 RAID controller enrichment — **DONE (initial multi-tool support)** (storcli + sas2/3ircu + arcconf + ssacli + VROC/mdstat)
|
||||||
- 1.11 Output and USB write — **DONE** (usb + /tmp fallback)
|
- 1.11 PSU SDR health — **DONE** (`ipmitool sdr` merged with FRU inventory)
|
||||||
|
- 1.11 Output and export workflow — **DONE** (explicit file output + manual removable export via TUI)
|
||||||
- 1.12 Integration test (local) — **DONE** (`scripts/test-local.sh`)
|
- 1.12 Integration test (local) — **DONE** (`scripts/test-local.sh`)
|
||||||
|
|
||||||
### Phase 2 — Alpine LiveCD
|
### Phase 2 — Debian Live ISO
|
||||||
|
|
||||||
- Debug ISO track is active (builder + overlay-debug + OpenRC services + TUI workflow).
|
- Current implementation uses Debian 12 `live-build`, `systemd`, and OpenSSH.
|
||||||
- Production ISO track — **IN PROGRESS**.
|
- Network bring-up on boot — **DONE**
|
||||||
- 2.3 Alpine mkimage profile — **DONE (production profile scaffold)**
|
- Boot services (`bee-network`, `bee-nvidia`, `bee-audit`, `bee-sshsetup`) — **DONE**
|
||||||
- 2.4 Network bring-up on boot — **DONE**
|
- Local console UX (`bee` autologin on `tty1`, `menu` auto-start, TUI privilege escalation via `sudo -n`) — **DONE**
|
||||||
- 2.5 OpenRC boot service (bee-audit) — **DONE** (with explicit bee-nvidia ordering)
|
- VM/debug support (`qemu-guest-agent`, serial console, virtual GPU initramfs modules) — **DONE**
|
||||||
- 2.6 Vendor utilities in overlay — **DONE (fetch script + iso/vendor scaffold)**
|
- Vendor utilities in overlay — **DONE**
|
||||||
- 2.7 Auto-update wiring (USB first, network second) — **PARTIAL** (shell flow done; strict Ed25519 verification intentionally deferred to final stage)
|
- Build metadata + staged overlay injection — **DONE**
|
||||||
- 2.8 Release workflow — **PARTIAL** (production build now injects audit binary, NVIDIA modules/tools, vendor tools, and build metadata)
|
- Builder container cache persisted outside container writable layer — **DONE**
|
||||||
|
- ISO volume label `BEE` — **DONE**
|
||||||
|
- Auto-update flow remains deferred; current focus is deterministic offline audit ISO behavior.
|
||||||
|
- Real-hardware validation remains **PENDING**; current validation is limited to local/libvirt VM boot + service checks.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Phase 1 — Go Audit Binary
|
## Phase 1 — Go Audit Binary
|
||||||
|
|
||||||
Self-contained static binary. Runs on any Linux (including Alpine LiveCD).
|
Self-contained static binary. Runs on any Linux (including the Debian live ISO).
|
||||||
Calls system utilities, parses their output, produces `HardwareIngestRequest` JSON.
|
Calls system utilities, parses their output, produces `HardwareIngestRequest` JSON.
|
||||||
|
|
||||||
### 1.1 — Project scaffold
|
### 1.1 — Project scaffold
|
||||||
|
|
||||||
- `audit/go.mod` — module `bee/audit`
|
- `audit/go.mod` — module `bee/audit`
|
||||||
- `audit/cmd/audit/main.go` — CLI entry point: flags, orchestration, JSON output
|
- `audit/cmd/bee/main.go` — main CLI entry point: subcommands, runtime selection, JSON output
|
||||||
- `audit/internal/schema/` — copy of `HardwareIngestRequest` types from core (no import dependency)
|
- `audit/internal/schema/` — copy of `HardwareIngestRequest` types from core (no import dependency)
|
||||||
- `audit/internal/collector/` — empty package stubs for all collectors
|
- `audit/internal/collector/` — empty package stubs for all collectors
|
||||||
- `const Version = "1.0"` in main
|
- `const Version = "1.0"` in main
|
||||||
@@ -237,305 +242,143 @@ No hardcoded vendor names in detection logic — pure PCI vendor_id map.
|
|||||||
|
|
||||||
Tests: table tests with storcli/sas2ircu text fixtures
|
Tests: table tests with storcli/sas2ircu text fixtures
|
||||||
|
|
||||||
### 1.11 — Output and USB write
|
### 1.11 — Output and export workflow
|
||||||
|
|
||||||
`--output stdout` (default): pretty-printed JSON to stdout
|
`--output stdout` (default): pretty-printed JSON to stdout
|
||||||
`--output file:<path>`: write JSON to explicit path
|
`--output file:<path>`: write JSON to explicit path
|
||||||
`--output usb`: auto-detect first removable block device, mount it, write `audit-<board_serial>-<YYYYMMDD-HHMMSS>.json`
|
|
||||||
|
|
||||||
USB detection: scan `/sys/block/*/removable`, pick first `1`, mount to `/tmp/bee-usb`
|
Live ISO default service output: `/var/log/bee-audit.json`
|
||||||
|
|
||||||
QR summary to stdout (always): board serial + model + component counts — fits in one QR code
|
Removable-media export is manual via `bee tui` (or the LiveCD wrapper `bee-tui`):
|
||||||
Uses `qrencode` if present, else skips silently
|
- operator chooses a removable filesystem explicitly
|
||||||
|
- TUI mounts it if needed
|
||||||
|
- TUI asks for confirmation before copying the JSON
|
||||||
|
- TUI unmounts temporary mountpoints after export
|
||||||
|
|
||||||
|
No auto-write to arbitrary removable media is allowed.
|
||||||
|
|
||||||
### 1.12 — Integration test (local)
|
### 1.12 — Integration test (local)
|
||||||
|
|
||||||
`scripts/test-local.sh` — runs audit binary on developer machine (Linux), captures JSON,
|
`scripts/test-local.sh` — runs `bee audit` on developer machine (Linux), captures JSON,
|
||||||
validates required fields are present (board.serial_number non-empty, cpus non-empty, etc.)
|
validates required fields are present (board.serial_number non-empty, cpus non-empty, etc.)
|
||||||
|
|
||||||
Not a unit test — requires real hardware access. Documents how to run for verification.
|
Not a unit test — requires real hardware access. Documents how to run for verification.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Phase 2 — Alpine LiveCD
|
## Phase 2 — Debian Live ISO
|
||||||
|
|
||||||
ISO image bootable via BMC virtual media. Runs audit binary automatically on boot.
|
ISO image bootable via BMC virtual media or USB. Runs boot services automatically and writes the audit result to `/var/log/bee-audit.json`.
|
||||||
|
|
||||||
### 2.1 — Builder environment
|
### 2.1 — Builder environment
|
||||||
|
|
||||||
`iso/builder/Dockerfile` — Alpine 3.21 build environment with:
|
`iso/builder/build-in-container.sh` is the only supported builder entrypoint.
|
||||||
- `alpine-sdk`, `abuild`, `squashfs-tools`, `xorriso`
|
It builds a Debian 12 builder image with `live-build`, toolchains, and pinned kernel headers,
|
||||||
- Go toolchain (for binary compilation inside builder)
|
then runs the ISO assembly in a privileged container because `live-build` needs
|
||||||
- NVIDIA driver `.run` pre-fetched during image build
|
mount/chroot/loop capabilities.
|
||||||
|
|
||||||
`iso/builder/build.sh` — orchestrates full ISO build:
|
`iso/builder/build.sh` orchestrates the full ISO build:
|
||||||
1. Compile Go binary (static, `CGO_ENABLED=0`)
|
1. compile the Go `bee` binary
|
||||||
2. Compile NVIDIA kernel module against Alpine 3.21 LTS kernel headers
|
2. create a staged overlay under `dist/overlay-stage`
|
||||||
3. Run `mkimage.sh` with bee profile
|
3. inject SSH auth, vendor tools, NVIDIA artifacts, and build metadata into the staged overlay
|
||||||
4. Output: `dist/bee-<version>.iso`
|
4. create a disposable `live-build` workdir under `dist/live-build-work`
|
||||||
|
5. sync the staged overlay into `config/includes.chroot/`
|
||||||
|
6. run `lb config && lb build`
|
||||||
|
7. copy the final ISO into `dist/`
|
||||||
|
|
||||||
### 2.2 — NVIDIA driver build
|
### 2.2 — NVIDIA driver build
|
||||||
|
|
||||||
Alpine 3.21, LTS kernel 6.6 — fixed versions in builder.
|
`iso/builder/build-nvidia-module.sh`:
|
||||||
|
- downloads the pinned NVIDIA `.run` installer
|
||||||
|
- verifies SHA256
|
||||||
|
- builds kernel modules against the pinned Debian kernel ABI
|
||||||
|
- caches modules, userspace tools, and libs in `dist/nvidia-<version>-<kver>/`
|
||||||
|
|
||||||
`iso/builder/build-nvidia.sh`:
|
`iso/overlay/usr/local/bin/bee-nvidia-load`:
|
||||||
- Download `NVIDIA-Linux-x86_64-<ver>.run` (version pinned in `iso/builder/VERSIONS`)
|
- loads `nvidia`, `nvidia-modeset`, `nvidia-uvm` via `insmod`
|
||||||
- Extract kernel module sources
|
- creates `/dev/nvidia*` nodes if the driver registered successfully
|
||||||
- Compile against `linux-lts-dev` headers
|
- logs failures but does not block the rest of boot
|
||||||
- Strip and package as `nvidia-<ver>-k6.6.ko.tar.gz` for inclusion in overlay
|
|
||||||
|
|
||||||
`iso/overlay/usr/local/bin/load-nvidia.sh`:
|
### 2.3 — ISO assembly and overlay policy
|
||||||
- `insmod` sequence: nvidia.ko → nvidia-modeset.ko → nvidia-uvm.ko
|
|
||||||
- Verify: `nvidia-smi -L` → log result
|
|
||||||
- On failure: log warning, continue (audit runs without GPU enrichment)
|
|
||||||
|
|
||||||
### 2.3 — Alpine mkimage profile
|
`iso/overlay/` is source-only input for the build.
|
||||||
|
|
||||||
`iso/builder/mkimg.bee.sh` — Alpine mkimage profile:
|
Build-time files are injected into the staged overlay only:
|
||||||
- Base: `alpine-base`
|
- `bee`
|
||||||
- Kernel: `linux-lts`
|
- `bee-smoketest`
|
||||||
- Packages: `dmidecode smartmontools nvme-cli pciutils ipmitool util-linux e2fsprogs qrencode`
|
- `authorized_keys`
|
||||||
- Overlay: `iso/overlay/` included as apkovl
|
- password-fallback marker
|
||||||
|
- `/etc/bee-release`
|
||||||
|
- vendor tools from `iso/vendor/`
|
||||||
|
|
||||||
### 2.4 — Network bring-up on boot
|
The source tree must stay clean after a build.
|
||||||
|
|
||||||
`iso/overlay/usr/local/bin/bee-network.sh`:
|
### 2.4 — Boot services
|
||||||
- Enumerate all network interfaces: `ip link show` → filter out loopback and virtual (docker/bridge)
|
|
||||||
- For each physical interface: `ip link set <iface> up` + `udhcpc -i <iface> -t 5 -T 3 -n`
|
|
||||||
- Log each interface result (got IP / timeout / no carrier)
|
|
||||||
- Continue regardless — network is best-effort for auto-update
|
|
||||||
|
|
||||||
`iso/overlay/etc/init.d/bee-network`:
|
`systemd` service order:
|
||||||
- runlevel: default, before: bee-update
|
- `bee-sshsetup.service` → configures SSH auth before `ssh.service`
|
||||||
- Calls bee-network.sh
|
- `bee-network.service` → starts best-effort DHCP on all physical interfaces
|
||||||
- Does not block boot if DHCP fails on all interfaces
|
- `bee-nvidia.service` → loads NVIDIA modules if present
|
||||||
|
- `bee-audit.service` → runs audit and logs failures without turning partial collector bugs into a boot blocker
|
||||||
|
|
||||||
### 2.5 — OpenRC boot service (bee-audit)
|
### 2.4b — Runtime split
|
||||||
|
|
||||||
`iso/overlay/etc/init.d/bee-audit`:
|
Target split:
|
||||||
- runlevel: default, after: bee-update
|
- main Go application works on a normal Linux host and on the live ISO
|
||||||
- start(): load-nvidia.sh → /usr/local/bin/audit --output usb
|
- live-ISO specifics stay in integration glue under `iso/`
|
||||||
- on completion: print QR summary to /dev/tty1 (always, even if USB write failed)
|
- the live ISO passes `--runtime livecd` to the Go binary
|
||||||
- log everything to /var/log/bee-audit.log
|
- local runs default to `--runtime auto`, which resolves to `local` unless a live marker is detected
|
||||||
- exits 0 regardless of partial failures — unattended, no prompts, no waits
|
|
||||||
|
|
||||||
Unattended invariants:
|
Planned code shape:
|
||||||
- No TTY prompts ever. All decisions are automatic.
|
- `audit/cmd/bee/` — main CLI entrypoint
|
||||||
- Missing USB: output goes to /tmp/bee-audit-<serial>-<date>.json, QR shown on screen.
|
- `audit/internal/runtimeenv/` — runtime detection and mode selection
|
||||||
- Missing NVIDIA driver: GPU records have status UNKNOWN, audit continues.
|
- future `audit/internal/tui/` — host/live shared TUI logic
|
||||||
- Missing ipmitool/storcli/any tool: that collector is skipped, rest continue.
|
- `iso/overlay/` — boot-time livecd integration only
|
||||||
- Timeout on any external command: 30s hard limit via `timeout` wrapper, then skip.
|
|
||||||
- Boot never hangs waiting for user input.
|
|
||||||
|
|
||||||
`iso/overlay/etc/runlevels/default/bee-audit` symlink
|
### 2.5 — Operator workflows
|
||||||
|
|
||||||
### 2.6 — Vendor utilities in overlay
|
- Automatic boot audit writes JSON to `/var/log/bee-audit.json`
|
||||||
|
- `tty1` autologins into `bee` and auto-runs `menu`
|
||||||
|
- `menu` launches the LiveCD wrapper `bee-tui`, which escalates to `root` via `sudo -n`
|
||||||
|
- `bee tui` can rerun the audit manually
|
||||||
|
- `bee tui` can export the latest audit JSON to removable media
|
||||||
|
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
||||||
|
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
|
||||||
|
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
||||||
|
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
|
||||||
|
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
||||||
|
|
||||||
`iso/overlay/usr/local/bin/` includes pre-fetched proprietary tools:
|
### 2.6 — Vendor utilities and optional assets
|
||||||
- `storcli64` (Broadcom)
|
|
||||||
- `sas2ircu`, `sas3ircu` (Broadcom/LSI)
|
|
||||||
- `mstflint` (NVIDIA Networking / Mellanox)
|
|
||||||
|
|
||||||
`scripts/fetch-vendor.sh` — downloads and places these before ISO build.
|
Optional binaries live in `iso/vendor/` and are included when present:
|
||||||
Checksums verified. Tools not committed to git — fetched at build time.
|
- `storcli64`
|
||||||
|
- `sas2ircu`, `sas3ircu`
|
||||||
|
- `arcconf`
|
||||||
|
- `ssacli`
|
||||||
|
- `mstflint` (via Debian package set)
|
||||||
|
|
||||||
`iso/vendor/.gitkeep` — placeholder, directory gitignored except .gitkeep
|
Missing optional tools do not fail the build or boot.
|
||||||
|
|
||||||
### 2.7 — Auto-update of audit binary (USB + network)
|
### 2.7 — Release workflow
|
||||||
|
|
||||||
Two update paths, tried in order on every boot:
|
`iso/builder/VERSIONS` pins the current release inputs:
|
||||||
|
- audit version
|
||||||
|
- Debian version / kernel ABI
|
||||||
|
- Go version
|
||||||
|
- NVIDIA driver version
|
||||||
|
|
||||||
**Path A — USB (no network required, higher priority):**
|
Current release model:
|
||||||
|
- shipping a new ISO means a full rebuild
|
||||||
`bee-update.sh` scans mounted removable media for an update package before checking network.
|
- build metadata is embedded into `/etc/bee-release` and `motd`
|
||||||
|
- current ISO label is `BEE`
|
||||||
Looks for: `<usb>/bee-update/bee-audit-linux-amd64` + `<usb>/bee-update/bee-audit-linux-amd64.sha256`
|
- binary self-update remains deferred; no automatic USB/network patching is part of the current runtime
|
||||||
|
|
||||||
Steps:
|
|
||||||
1. Find USB mount point (same detection as audit output: `/sys/block/*/removable`)
|
|
||||||
2. Check for `bee-update/bee-audit-linux-amd64` on the USB root
|
|
||||||
3. Read version from `bee-update/VERSION` file (plain text, e.g. `1.3`)
|
|
||||||
4. Compare with running binary version (`/usr/local/bin/audit --version`)
|
|
||||||
5. If USB version > running: verify SHA256 checksum, replace binary, log update
|
|
||||||
6. Re-run audit if updated
|
|
||||||
|
|
||||||
**Authenticity verification — Ed25519 multi-key trust (stdlib only, no external tools):**
|
|
||||||
|
|
||||||
Problem: SHA256 alone does not prevent a crafted attack — an attacker places their binary
|
|
||||||
and a matching SHA256 next to it. The LiveCD would accept it.
|
|
||||||
|
|
||||||
Solution: Ed25519 asymmetric signatures via Go stdlib `crypto/ed25519`.
|
|
||||||
Multiple developer public keys are supported. A binary update is accepted if its signature
|
|
||||||
verifies against ANY of the embedded trusted public keys.
|
|
||||||
|
|
||||||
This mirrors the SSH authorized_keys model: add a developer → add their public key.
|
|
||||||
Remove a developer → rebuild without their key.
|
|
||||||
|
|
||||||
**Key management — centralized across all projects:**
|
|
||||||
|
|
||||||
Public keys live in a dedicated repo at git.mchus.pro/mchus/keys (or similar):
|
|
||||||
```
|
|
||||||
keys/
|
|
||||||
developers/
|
|
||||||
mchusavitin.pub ← Ed25519 public key, base64, one line
|
|
||||||
developer2.pub
|
|
||||||
README.md ← how to generate a key pair
|
|
||||||
```
|
|
||||||
|
|
||||||
Public keys are safe to commit — they are not secret.
|
|
||||||
Private keys stay on each developer's machine, never committed anywhere.
|
|
||||||
|
|
||||||
Key generation (one-time per developer, run locally):
|
|
||||||
```sh
|
|
||||||
# scripts/keygen.sh — also lives in the keys repo
|
|
||||||
openssl genpkey -algorithm ed25519 -out ~/.bee-release.key
|
|
||||||
openssl pkey -in ~/.bee-release.key -pubout -outform DER \
|
|
||||||
| tail -c 32 | base64 > mchusavitin.pub
|
|
||||||
```
|
|
||||||
|
|
||||||
**Embedding public keys at release time (not compile time):**
|
|
||||||
|
|
||||||
Public keys are injected via `-ldflags` at build time from the keys repo.
|
|
||||||
The binary does not hardcode keys — they are provided by the release script.
|
|
||||||
|
|
||||||
```go
|
|
||||||
// audit/internal/updater/trust.go
|
|
||||||
// trustedKeysRaw is injected at build time via -ldflags
|
|
||||||
// format: base64(key1):base64(key2):...
|
|
||||||
var trustedKeysRaw string
|
|
||||||
|
|
||||||
func trustedKeys() ([]ed25519.PublicKey, error) {
|
|
||||||
if trustedKeysRaw == "" {
|
|
||||||
return nil, fmt.Errorf("binary built without trusted keys — updates disabled")
|
|
||||||
}
|
|
||||||
var keys []ed25519.PublicKey
|
|
||||||
for _, enc := range strings.Split(trustedKeysRaw, ":") {
|
|
||||||
b, err := base64.StdEncoding.DecodeString(strings.TrimSpace(enc))
|
|
||||||
if err != nil || len(b) != ed25519.PublicKeySize {
|
|
||||||
return nil, fmt.Errorf("invalid trusted key: %w", err)
|
|
||||||
}
|
|
||||||
keys = append(keys, ed25519.PublicKey(b))
|
|
||||||
}
|
|
||||||
return keys, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func verifySignature(binaryPath, sigPath string) error {
|
|
||||||
keys, err := trustedKeys()
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
data, _ := os.ReadFile(binaryPath)
|
|
||||||
sig, _ := os.ReadFile(sigPath) // 64 bytes raw Ed25519 signature
|
|
||||||
for _, key := range keys {
|
|
||||||
if ed25519.Verify(key, data, sig) {
|
|
||||||
return nil // any trusted key accepts → pass
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return fmt.Errorf("signature verification failed: no trusted key matched")
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Release build injects keys:
|
|
||||||
```sh
|
|
||||||
# scripts/build-release.sh
|
|
||||||
KEYS=$(paste -sd: keys/developers/*.pub)
|
|
||||||
go build -ldflags "-X bee/audit/internal/updater/trust.trustedKeysRaw=${KEYS}" \
|
|
||||||
-o dist/bee-audit-linux-amd64 ./cmd/audit
|
|
||||||
```
|
|
||||||
|
|
||||||
Signing (release engineer signs with their private key):
|
|
||||||
```sh
|
|
||||||
# scripts/sign-release.sh <binary>
|
|
||||||
openssl pkeyutl -sign -inkey ~/.bee-release.key \
|
|
||||||
-rawin -in "$1" -out "$1.sig"
|
|
||||||
```
|
|
||||||
|
|
||||||
Binary built without `-ldflags` injection (e.g. local dev build) has `trustedKeysRaw=""`
|
|
||||||
→ updates are disabled, logged as INFO, audit continues normally.
|
|
||||||
|
|
||||||
Update rejected silently (logged as WARNING, audit continues with current binary) if:
|
|
||||||
- `.sig` file missing
|
|
||||||
- Signature does not match any trusted key
|
|
||||||
- `trustedKeysRaw` empty (dev build)
|
|
||||||
|
|
||||||
Update package layout on USB:
|
|
||||||
```
|
|
||||||
/bee-update/
|
|
||||||
bee-audit-linux-amd64 ← new binary (also signed with embedded keys)
|
|
||||||
bee-audit-linux-amd64.sig ← Ed25519 signature (64 bytes raw)
|
|
||||||
VERSION ← plain version string e.g. "1.3"
|
|
||||||
```
|
|
||||||
|
|
||||||
Admin workflow: download `bee-audit-linux-amd64` + `bee-audit-linux-amd64.sig` from Gitea
|
|
||||||
release assets, place in `bee-update/` on USB.
|
|
||||||
|
|
||||||
**Path B — Network (requires DHCP on at least one interface):**
|
|
||||||
1. Check network: ping git.mchus.pro -c 1 -W 3 || skip
|
|
||||||
2. Fetch: `GET https://git.mchus.pro/api/v1/repos/<org>/bee/releases/latest`
|
|
||||||
3. Parse tag_name, asset URLs for `bee-audit-linux-amd64` + `bee-audit-linux-amd64.sig`
|
|
||||||
4. Compare tag with running version
|
|
||||||
5. If newer: download both files to /tmp, verify Ed25519 signature against all trusted keys
|
|
||||||
6. Replace binary on pass, log and skip on fail
|
|
||||||
7. Re-run audit if updated
|
|
||||||
|
|
||||||
**Ordering:** USB update checked first, network checked second.
|
|
||||||
If USB update applied and verified, network check is skipped.
|
|
||||||
|
|
||||||
`iso/overlay/etc/init.d/bee-update`:
|
|
||||||
- runlevel: default
|
|
||||||
- after: bee-network (network path needs interfaces up)
|
|
||||||
- before: bee-audit (audit runs with latest binary)
|
|
||||||
- Calls bee-update.sh
|
|
||||||
|
|
||||||
Triggered after bee-audit completes, only if network is available.
|
|
||||||
|
|
||||||
`iso/overlay/usr/local/bin/bee-update.sh`:
|
|
||||||
|
|
||||||
```
|
|
||||||
1. Check network: ping git.mchus.pro -c 1 -W 3 || exit 0
|
|
||||||
2. Fetch latest release metadata:
|
|
||||||
GET https://git.mchus.pro/api/v1/repos/<org>/bee/releases/latest
|
|
||||||
3. Parse: extract tag_name, asset URL for bee-audit-linux-amd64
|
|
||||||
4. Compare tag_name with /usr/local/bin/audit --version output
|
|
||||||
5. If newer: download to /tmp/bee-audit-new, verify SHA256 checksum from release assets
|
|
||||||
6. Replace /usr/local/bin/audit (tmpfs — survives until reboot)
|
|
||||||
7. Log: updated from vX.Y to vX.Z
|
|
||||||
8. Re-run audit if update happened: /usr/local/bin/audit --output usb
|
|
||||||
```
|
|
||||||
|
|
||||||
`iso/overlay/etc/init.d/bee-update`:
|
|
||||||
- runlevel: default
|
|
||||||
- after: bee-audit, network
|
|
||||||
- Calls bee-update.sh
|
|
||||||
|
|
||||||
Release naming convention: binary asset named `bee-audit-linux-amd64` per release tag.
|
|
||||||
|
|
||||||
### 2.8 — Release workflow
|
|
||||||
|
|
||||||
`iso/builder/VERSIONS` — pinned versions:
|
|
||||||
```
|
|
||||||
AUDIT_VERSION=1.0
|
|
||||||
ALPINE_VERSION=3.21
|
|
||||||
KERNEL_VERSION=6.12
|
|
||||||
NVIDIA_DRIVER_VERSION=590.48.01
|
|
||||||
```
|
|
||||||
|
|
||||||
LiveCD release = full ISO rebuild. Binary-only patch = new Gitea release with binary asset.
|
|
||||||
On boot with network: ISO auto-patches its binary without full rebuild.
|
|
||||||
|
|
||||||
ISO version embedded in `/etc/bee-release`:
|
|
||||||
```
|
|
||||||
BEE_ISO_VERSION=1.0
|
|
||||||
BEE_AUDIT_VERSION=1.0
|
|
||||||
BUILD_DATE=2026-03-05
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Eating order
|
## Eating order
|
||||||
|
|
||||||
Builder environment is set up early (after 1.3) so every subsequent collector
|
Builder environment is set up early (after 1.3) so every subsequent collector
|
||||||
is developed and tested directly on real hardware in the actual Alpine environment.
|
is developed and tested directly on real hardware in the actual Debian live ISO environment.
|
||||||
No "works on my Mac" drift.
|
No "works on my Mac" drift.
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -544,10 +387,10 @@ No "works on my Mac" drift.
|
|||||||
1.2 board collector → first real data
|
1.2 board collector → first real data
|
||||||
1.3 CPU collector → +CPUs
|
1.3 CPU collector → +CPUs
|
||||||
|
|
||||||
--- BUILDER + DEBUG ISO (unblock real-hardware testing) ---
|
--- BUILDER + BEE ISO (unblock real-hardware testing) ---
|
||||||
|
|
||||||
2.1 builder VM setup → Alpine VM with build deps + Go toolchain
|
2.1 builder setup → privileged container with build deps
|
||||||
2.2 debug ISO profile → minimal Alpine ISO: audit binary + dropbear SSH + all packages
|
2.2 debug ISO profile → minimal Debian ISO: `bee` binary + OpenSSH + all packages
|
||||||
2.3 boot on real server → SSH in, verify packages present, run audit manually
|
2.3 boot on real server → SSH in, verify packages present, run audit manually
|
||||||
|
|
||||||
--- CONTINUE COLLECTORS (tested on real hardware from here) ---
|
--- CONTINUE COLLECTORS (tested on real hardware from here) ---
|
||||||
@@ -560,14 +403,14 @@ No "works on my Mac" drift.
|
|||||||
1.8b wear/age telemetry → +SMART hours, NVMe % used, SFP DOM, ECC
|
1.8b wear/age telemetry → +SMART hours, NVMe % used, SFP DOM, ECC
|
||||||
1.9 Mellanox NIC enrichment → +NIC firmware/serial
|
1.9 Mellanox NIC enrichment → +NIC firmware/serial
|
||||||
1.10 RAID enrichment → +physical disks behind RAID
|
1.10 RAID enrichment → +physical disks behind RAID
|
||||||
1.11 output + USB write → production-ready output
|
1.11 output + export workflow → file output + explicit removable export
|
||||||
|
|
||||||
--- PRODUCTION ISO ---
|
--- PRODUCTION ISO ---
|
||||||
|
|
||||||
2.4 NVIDIA driver build → driver compiled into overlay
|
2.4 NVIDIA driver build → driver compiled into overlay
|
||||||
2.5 network bring-up on boot → DHCP on all interfaces
|
2.5 network bring-up on boot → DHCP on all interfaces
|
||||||
2.6 OpenRC boot service → audit runs on boot automatically
|
2.6 systemd boot service → audit runs on boot automatically
|
||||||
2.7 vendor utilities → storcli/sas2ircu/mstflint in image
|
2.7 vendor utilities → storcli/sas2ircu/arcconf/ssacli in image
|
||||||
2.8 auto-update → binary self-patches from Gitea
|
2.8 release workflow → versioning + release notes
|
||||||
2.9 release workflow → versioning + release notes
|
2.9 operator export flow → explicit TUI export to removable media
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -1,167 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"encoding/json"
|
|
||||||
"flag"
|
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
|
||||||
"os"
|
|
||||||
"os/exec"
|
|
||||||
"path/filepath"
|
|
||||||
"sort"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"bee/audit/internal/collector"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Version is the audit binary version.
|
|
||||||
// Injected at release build time via:
|
|
||||||
//
|
|
||||||
// -ldflags "-X main.Version=1.2"
|
|
||||||
//
|
|
||||||
// Defaults to "dev" in local builds.
|
|
||||||
var Version = "dev"
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
output := flag.String("output", "stdout", `output destination:
|
|
||||||
stdout — print JSON to stdout (default)
|
|
||||||
file:<path> — write JSON to file
|
|
||||||
usb — auto-detect removable media, write JSON there`)
|
|
||||||
showVersion := flag.Bool("version", false, "print version and exit")
|
|
||||||
flag.Parse()
|
|
||||||
|
|
||||||
if *showVersion {
|
|
||||||
fmt.Println(Version)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
|
||||||
Level: slog.LevelInfo,
|
|
||||||
})))
|
|
||||||
|
|
||||||
result := collector.Run()
|
|
||||||
|
|
||||||
data, err := json.MarshalIndent(result, "", " ")
|
|
||||||
if err != nil {
|
|
||||||
slog.Error("marshal result", "err", err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := writeOutput(*output, data); err != nil {
|
|
||||||
slog.Error("write output", "destination", *output, "err", err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func writeOutput(dest string, data []byte) error {
|
|
||||||
switch {
|
|
||||||
case dest == "stdout":
|
|
||||||
_, err := os.Stdout.Write(append(data, '\n'))
|
|
||||||
return err
|
|
||||||
|
|
||||||
case strings.HasPrefix(dest, "file:"):
|
|
||||||
path := strings.TrimPrefix(dest, "file:")
|
|
||||||
return os.WriteFile(path, append(data, '\n'), 0644)
|
|
||||||
|
|
||||||
case dest == "usb":
|
|
||||||
return writeToUSB(data)
|
|
||||||
|
|
||||||
default:
|
|
||||||
return fmt.Errorf("unknown output destination %q — use stdout, file:<path>, or usb", dest)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// writeToUSB auto-detects the first removable block device, mounts it,
|
|
||||||
// and writes the audit JSON. Falls back to /tmp on any failure.
|
|
||||||
func writeToUSB(data []byte) error {
|
|
||||||
boardSerial := extractBoardSerial(data)
|
|
||||||
filename := auditFilename(boardSerial, time.Now().UTC())
|
|
||||||
|
|
||||||
device, err := firstRemovableDevice()
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("usb output: no removable device, writing to /tmp", "err", err)
|
|
||||||
return writeAuditToPath(filepath.Join("/tmp", filename), data)
|
|
||||||
}
|
|
||||||
|
|
||||||
mountpoint := "/tmp/bee-usb"
|
|
||||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := exec.Command("mount", device, mountpoint).Run(); err != nil {
|
|
||||||
slog.Warn("usb output: mount failed, writing to /tmp", "device", device, "err", err)
|
|
||||||
return writeAuditToPath(filepath.Join("/tmp", filename), data)
|
|
||||||
}
|
|
||||||
defer func() {
|
|
||||||
if err := exec.Command("umount", mountpoint).Run(); err != nil {
|
|
||||||
slog.Warn("usb output: umount failed", "mountpoint", mountpoint, "err", err)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
path := filepath.Join(mountpoint, filename)
|
|
||||||
if err := writeAuditToPath(path, data); err != nil {
|
|
||||||
slog.Warn("usb output: write failed, falling back to /tmp", "path", path, "err", err)
|
|
||||||
return writeAuditToPath(filepath.Join("/tmp", filename), data)
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Info("usb output: written", "path", path)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func writeAuditToPath(path string, data []byte) error {
|
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
slog.Info("audit output written", "path", path)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func extractBoardSerial(data []byte) string {
|
|
||||||
var doc struct {
|
|
||||||
Hardware struct {
|
|
||||||
Board struct {
|
|
||||||
SerialNumber string `json:"serial_number"`
|
|
||||||
} `json:"board"`
|
|
||||||
} `json:"hardware"`
|
|
||||||
}
|
|
||||||
if err := json.Unmarshal(data, &doc); err != nil {
|
|
||||||
return "unknown"
|
|
||||||
}
|
|
||||||
serial := strings.TrimSpace(doc.Hardware.Board.SerialNumber)
|
|
||||||
if serial == "" {
|
|
||||||
return "unknown"
|
|
||||||
}
|
|
||||||
return serial
|
|
||||||
}
|
|
||||||
|
|
||||||
func auditFilename(boardSerial string, now time.Time) string {
|
|
||||||
boardSerial = strings.TrimSpace(boardSerial)
|
|
||||||
if boardSerial == "" {
|
|
||||||
boardSerial = "unknown"
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("audit-%s-%s.json", boardSerial, now.Format("20060102-150405"))
|
|
||||||
}
|
|
||||||
|
|
||||||
func firstRemovableDevice() (string, error) {
|
|
||||||
entries, err := os.ReadDir("/sys/block")
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
sort.Slice(entries, func(i, j int) bool { return entries[i].Name() < entries[j].Name() })
|
|
||||||
|
|
||||||
for _, e := range entries {
|
|
||||||
name := e.Name()
|
|
||||||
if strings.HasPrefix(name, "loop") || strings.HasPrefix(name, "ram") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
removableFlag, err := os.ReadFile(filepath.Join("/sys/block", name, "removable"))
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(string(removableFlag)) == "1" {
|
|
||||||
return filepath.Join("/dev", name), nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return "", fmt.Errorf("no removable block device found")
|
|
||||||
}
|
|
||||||
369
audit/cmd/bee/main.go
Normal file
369
audit/cmd/bee/main.go
Normal file
@@ -0,0 +1,369 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/runtimeenv"
|
||||||
|
"bee/audit/internal/webui"
|
||||||
|
)
|
||||||
|
|
||||||
|
var Version = "dev"
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||||
|
}
|
||||||
|
|
||||||
|
func run(args []string, stdout, stderr io.Writer) int {
|
||||||
|
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||||
|
Level: slog.LevelInfo,
|
||||||
|
})))
|
||||||
|
|
||||||
|
if len(args) == 0 {
|
||||||
|
printRootUsage(stderr)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
switch args[0] {
|
||||||
|
case "help", "--help", "-h":
|
||||||
|
if len(args) > 1 {
|
||||||
|
return runHelp(args[1:], stdout, stderr)
|
||||||
|
}
|
||||||
|
printRootUsage(stdout)
|
||||||
|
return 0
|
||||||
|
case "audit":
|
||||||
|
return runAudit(args[1:], stdout, stderr)
|
||||||
|
case "export":
|
||||||
|
return runExport(args[1:], stdout, stderr)
|
||||||
|
case "preflight":
|
||||||
|
return runPreflight(args[1:], stdout, stderr)
|
||||||
|
case "support-bundle":
|
||||||
|
return runSupportBundle(args[1:], stdout, stderr)
|
||||||
|
case "web":
|
||||||
|
return runWeb(args[1:], stdout, stderr)
|
||||||
|
case "sat":
|
||||||
|
return runSAT(args[1:], stdout, stderr)
|
||||||
|
case "version", "--version", "-version":
|
||||||
|
fmt.Fprintln(stdout, Version)
|
||||||
|
return 0
|
||||||
|
default:
|
||||||
|
fmt.Fprintf(stderr, "bee: unknown command %q\n\n", args[0])
|
||||||
|
printRootUsage(stderr)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func printRootUsage(w io.Writer) {
|
||||||
|
fmt.Fprintln(w, `bee commands:
|
||||||
|
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
||||||
|
bee preflight --output stdout|file:<path>
|
||||||
|
bee export --target <device>
|
||||||
|
bee support-bundle --output stdout|file:<path>
|
||||||
|
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
||||||
|
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||||
|
bee version
|
||||||
|
bee help [command]`)
|
||||||
|
}
|
||||||
|
|
||||||
|
func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||||
|
switch args[0] {
|
||||||
|
case "audit":
|
||||||
|
return runAudit([]string{"--help"}, stdout, stdout)
|
||||||
|
case "export":
|
||||||
|
return runExport([]string{"--help"}, stdout, stdout)
|
||||||
|
case "preflight":
|
||||||
|
return runPreflight([]string{"--help"}, stdout, stdout)
|
||||||
|
case "support-bundle":
|
||||||
|
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||||
|
case "web":
|
||||||
|
return runWeb([]string{"--help"}, stdout, stdout)
|
||||||
|
case "sat":
|
||||||
|
return runSAT([]string{"--help"}, stdout, stderr)
|
||||||
|
case "version":
|
||||||
|
fmt.Fprintln(stdout, "usage: bee version")
|
||||||
|
return 0
|
||||||
|
default:
|
||||||
|
fmt.Fprintf(stderr, "bee help: unknown command %q\n\n", args[0])
|
||||||
|
printRootUsage(stderr)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func runAudit(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("audit", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
||||||
|
runtimeFlag := fs.String("runtime", "auto", "runtime environment: auto, local, livecd")
|
||||||
|
showVersion := fs.Bool("version", false, "print version and exit")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintln(stderr, "usage: bee audit [--runtime auto|local|livecd] [--output stdout|file:<path>]")
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if *showVersion {
|
||||||
|
fmt.Fprintln(stdout, Version)
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
runtimeInfo, err := runtimeenv.Detect(*runtimeFlag)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("resolve runtime", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
slog.Info("runtime resolved", "mode", runtimeInfo.Mode, "reason", runtimeInfo.Reason)
|
||||||
|
|
||||||
|
application := app.New(platform.New())
|
||||||
|
path, err := application.RunAudit(runtimeInfo.Mode, *output)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("run audit", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
if path != "stdout" {
|
||||||
|
slog.Info("audit output written", "path", path)
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
func runExport(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
targetDevice := fs.String("target", "", "removable device path, e.g. /dev/sdb1")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintln(stderr, "usage: bee export --target <device>")
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(*targetDevice) == "" {
|
||||||
|
fmt.Fprintln(stderr, "bee export: --target is required")
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
application := app.New(platform.New())
|
||||||
|
targets, err := application.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("list removable targets", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, target := range targets {
|
||||||
|
if target.Device == *targetDevice {
|
||||||
|
path, err := application.ExportLatestAudit(target)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("export latest audit", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
slog.Info("audit exported", "path", path)
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Error("target device not found among removable filesystems", "device", *targetDevice)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
func runPreflight(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("preflight", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee preflight [--output stdout|file:%s]\n", app.DefaultRuntimeJSONPath)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
application := app.New(platform.New())
|
||||||
|
path, err := application.RunRuntimePreflight(*output)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("run preflight", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
if path != "stdout" {
|
||||||
|
slog.Info("runtime health written", "path", path)
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintln(stderr, "usage: bee support-bundle [--output stdout|file:<path>]")
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
path, err := app.BuildSupportBundle(app.DefaultExportDir)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("build support bundle", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
defer os.Remove(path)
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("read support bundle", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case *output == "stdout":
|
||||||
|
if _, err := stdout.Write(raw); err != nil {
|
||||||
|
slog.Error("write support bundle stdout", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(*output, "file:"):
|
||||||
|
dst := strings.TrimPrefix(*output, "file:")
|
||||||
|
if err := os.WriteFile(dst, raw, 0644); err != nil {
|
||||||
|
slog.Error("write support bundle", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
slog.Info("support bundle written", "path", dst)
|
||||||
|
default:
|
||||||
|
fmt.Fprintln(stderr, "bee support-bundle: unknown output destination")
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
||||||
|
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
|
||||||
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||||
|
title := fs.String("title", "Bee Hardware Audit", "page title")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee web [--listen :80] [--audit-path %s] [--export-dir %s] [--title \"Bee Hardware Audit\"]\n", app.DefaultAuditJSONPath, app.DefaultExportDir)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)
|
||||||
|
|
||||||
|
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("resolve runtime for web", "err", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
||||||
|
Title: *title,
|
||||||
|
AuditPath: *auditPath,
|
||||||
|
ExportDir: *exportDir,
|
||||||
|
App: app.New(platform.New()),
|
||||||
|
RuntimeMode: runtimeInfo.Mode,
|
||||||
|
}); err != nil {
|
||||||
|
slog.Error("run web", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||||
|
if len(args) == 0 {
|
||||||
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||||
|
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
||||||
|
if err := fs.Parse(args[1:]); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fmt.Fprintf(stderr, "bee sat: unexpected arguments\n")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
target := args[0]
|
||||||
|
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
||||||
|
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
||||||
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
application := app.New(platform.New())
|
||||||
|
var (
|
||||||
|
archive string
|
||||||
|
err error
|
||||||
|
)
|
||||||
|
switch target {
|
||||||
|
case "nvidia":
|
||||||
|
archive, err = application.RunNvidiaAcceptancePack("")
|
||||||
|
case "memory":
|
||||||
|
archive, err = application.RunMemoryAcceptancePack("")
|
||||||
|
case "storage":
|
||||||
|
archive, err = application.RunStorageAcceptancePack("")
|
||||||
|
case "cpu":
|
||||||
|
dur := *duration
|
||||||
|
if dur <= 0 {
|
||||||
|
dur = 60
|
||||||
|
}
|
||||||
|
archive, err = application.RunCPUAcceptancePack("", dur)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("run sat", "target", target, "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
slog.Info("sat archive written", "target", target, "path", archive)
|
||||||
|
return 0
|
||||||
|
}
|
||||||
219
audit/cmd/bee/main_test.go
Normal file
219
audit/cmd/bee/main_test.go
Normal file
@@ -0,0 +1,219 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRunRootHelp(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"help"}, &stdout, &stderr)
|
||||||
|
if rc != 0 {
|
||||||
|
t.Fatalf("rc=%d want 0", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stdout.String(), "bee commands:") {
|
||||||
|
t.Fatalf("stdout missing root usage:\n%s", stdout.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunNoArgsPrintsUsage(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run(nil, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), "bee commands:") {
|
||||||
|
t.Fatalf("stderr missing root usage:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunUnknownCommand(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"wat"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), `unknown command "wat"`) {
|
||||||
|
t.Fatalf("stderr missing unknown command message:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunVersion(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
old := Version
|
||||||
|
Version = "test-version"
|
||||||
|
t.Cleanup(func() { Version = old })
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"version"}, &stdout, &stderr)
|
||||||
|
if rc != 0 {
|
||||||
|
t.Fatalf("rc=%d want 0", rc)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(stdout.String()) != "test-version" {
|
||||||
|
t.Fatalf("stdout=%q want %q", strings.TrimSpace(stdout.String()), "test-version")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunExportRequiresTarget(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"export"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), "--target is required") {
|
||||||
|
t.Fatalf("stderr missing target error:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), "usage: bee export --target <device>") {
|
||||||
|
t.Fatalf("stderr missing export usage:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunSATUsage(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"sat"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), "usage: bee sat nvidia|memory|storage") {
|
||||||
|
t.Fatalf("stderr missing sat usage:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunPreflightRejectsExtraArgs(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"preflight", "extra"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), "usage: bee preflight") {
|
||||||
|
t.Fatalf("stderr missing preflight usage:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunSupportBundleRejectsExtraArgs(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"support-bundle", "extra"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), "usage: bee support-bundle") {
|
||||||
|
t.Fatalf("stderr missing support-bundle usage:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunHelpForSubcommand(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"help", "export"}, &stdout, &stderr)
|
||||||
|
if rc != 0 {
|
||||||
|
t.Fatalf("rc=%d want 0", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stdout.String(), "usage: bee export --target <device>") {
|
||||||
|
t.Fatalf("stdout missing export help:\n%s", stdout.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunHelpUnknownSubcommand(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"help", "wat"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), `bee help: unknown command "wat"`) {
|
||||||
|
t.Fatalf("stderr missing help error:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunSATUnknownTarget(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"sat", "amd"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), `unknown target "amd"`) {
|
||||||
|
t.Fatalf("stderr missing sat target error:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunSATHelp(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"sat", "--help"}, &stdout, &stderr)
|
||||||
|
if rc != 0 {
|
||||||
|
t.Fatalf("rc=%d want 0", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stdout.String(), "usage: bee sat nvidia|memory|storage|cpu") {
|
||||||
|
t.Fatalf("stdout missing sat help:\n%s", stdout.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunSATRejectsExtraArgs(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"sat", "memory", "extra"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), "bee sat: unexpected arguments") {
|
||||||
|
t.Fatalf("stderr missing sat error:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunAuditInvalidRuntime(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"audit", "--runtime", "bad"}, &stdout, &stderr)
|
||||||
|
if rc != 1 {
|
||||||
|
t.Fatalf("rc=%d want 1", rc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunAuditRejectsExtraArgs(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"audit", "extra"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), "usage: bee audit") {
|
||||||
|
t.Fatalf("stderr missing audit usage:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunExportRejectsExtraArgs(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
rc := run([]string{"export", "--target", "/dev/sdb1", "extra"}, &stdout, &stderr)
|
||||||
|
if rc != 2 {
|
||||||
|
t.Fatalf("rc=%d want 2", rc)
|
||||||
|
}
|
||||||
|
if !strings.Contains(stderr.String(), "usage: bee export --target <device>") {
|
||||||
|
t.Fatalf("stderr missing export usage:\n%s", stderr.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
16
audit/go.mod
16
audit/go.mod
@@ -1,3 +1,17 @@
|
|||||||
module bee/audit
|
module bee/audit
|
||||||
|
|
||||||
go 1.23
|
go 1.24.0
|
||||||
|
|
||||||
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/go-analyze/charts v0.5.26
|
||||||
|
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||||
|
)
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
|
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||||
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||||
|
golang.org/x/image v0.24.0 // indirect
|
||||||
|
)
|
||||||
|
|||||||
18
audit/go.sum
Normal file
18
audit/go.sum
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
|
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
||||||
|
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
||||||
|
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
||||||
|
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||||
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||||
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
|
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||||
|
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||||
|
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||||
|
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
1025
audit/internal/app/app.go
Normal file
1025
audit/internal/app/app.go
Normal file
File diff suppressed because it is too large
Load Diff
814
audit/internal/app/app_test.go
Normal file
814
audit/internal/app/app_test.go
Normal file
@@ -0,0 +1,814 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"compress/gzip"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
type fakeNetwork struct {
|
||||||
|
listInterfacesFn func() ([]platform.InterfaceInfo, error)
|
||||||
|
defaultRouteFn func() string
|
||||||
|
dhcpOneFn func(string) (string, error)
|
||||||
|
dhcpAllFn func() (string, error)
|
||||||
|
setStaticIPv4Fn func(platform.StaticIPv4Config) (string, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeNetwork) ListInterfaces() ([]platform.InterfaceInfo, error) {
|
||||||
|
return f.listInterfacesFn()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeNetwork) DefaultRoute() string {
|
||||||
|
return f.defaultRouteFn()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeNetwork) DHCPOne(iface string) (string, error) {
|
||||||
|
return f.dhcpOneFn(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeNetwork) DHCPAll() (string, error) {
|
||||||
|
return f.dhcpAllFn()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||||
|
return f.setStaticIPv4Fn(cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
type fakeServices struct {
|
||||||
|
serviceStatusFn func(string) (string, error)
|
||||||
|
serviceDoFn func(string, platform.ServiceAction) (string, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeServices) ListBeeServices() ([]string, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeServices) ServiceState(name string) string {
|
||||||
|
return "active"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeServices) ServiceStatus(name string) (string, error) {
|
||||||
|
return f.serviceStatusFn(name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (string, error) {
|
||||||
|
return f.serviceDoFn(name, action)
|
||||||
|
}
|
||||||
|
|
||||||
|
type fakeExports struct {
|
||||||
|
listTargetsFn func() ([]platform.RemovableTarget, error)
|
||||||
|
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||||
|
if f.listTargetsFn != nil {
|
||||||
|
return f.listTargetsFn()
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
|
||||||
|
if f.exportToTargetFn != nil {
|
||||||
|
return f.exportToTargetFn(src, target)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// fakeRuntime is a test double for App's runtime-health dependency.
// collectFn must be set by tests that trigger health collection;
// dumpFn is optional (CaptureTechnicalDump no-ops when it is nil).
type fakeRuntime struct {
	collectFn func(string) (schema.RuntimeHealth, error)
	dumpFn    func(string) error
}
|
||||||
|
|
||||||
|
// CollectRuntimeHealth delegates to the test-supplied collectFn.
// Panics (nil dereference) if the fake was built without the hook.
func (f fakeRuntime) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
	return f.collectFn(exportDir)
}
|
||||||
|
|
||||||
|
func (f fakeRuntime) CaptureTechnicalDump(baseDir string) error {
|
||||||
|
if f.dumpFn != nil {
|
||||||
|
return f.dumpFn(baseDir)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// fakeTools is a test double for App's tools dependency.
// Both hooks must be set by tests that exercise the corresponding
// methods; neither method nil-checks its hook.
type fakeTools struct {
	tailFileFn   func(string, int) string
	checkToolsFn func([]string) []platform.ToolStatus
}
|
||||||
|
|
||||||
|
// TailFile delegates to the test-supplied tailFileFn.
// Panics (nil dereference) if the fake was built without the hook.
func (f fakeTools) TailFile(path string, lines int) string {
	return f.tailFileFn(path, lines)
}
|
||||||
|
|
||||||
|
// CheckTools delegates to the test-supplied checkToolsFn.
// Panics (nil dereference) if the fake was built without the hook.
func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
	return f.checkToolsFn(names)
}
|
||||||
|
|
||||||
|
// fakeSAT is a test double for App's SAT (system acceptance test)
// dependency. runNvidiaFn/runMemoryFn/runStorageFn are always
// dereferenced, so tests that use those methods must set them; the
// remaining hooks are optional and fall back to zero values.
type fakeSAT struct {
	runNvidiaFn      func(string) (string, error)
	runMemoryFn      func(string) (string, error)
	runStorageFn     func(string) (string, error)
	runCPUFn         func(string, int) (string, error)
	detectVendorFn   func() string
	listAMDGPUsFn    func() ([]platform.AMDGPUInfo, error)
	runAMDPackFn     func(string) (string, error)
	listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
}
|
||||||
|
|
||||||
|
// RunNvidiaAcceptancePack delegates to the test-supplied runNvidiaFn.
// Panics (nil dereference) if the fake was built without the hook.
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
	return f.runNvidiaFn(baseDir)
}
|
||||||
|
|
||||||
|
// RunNvidiaAcceptancePackWithOptions ignores the context, duration and
// GPU selection and reuses runNvidiaFn, so both entry points share one hook.
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int) (string, error) {
	return f.runNvidiaFn(baseDir)
}
|
||||||
|
|
||||||
|
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||||
|
if f.listNvidiaGPUsFn != nil {
|
||||||
|
return f.listNvidiaGPUsFn()
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunMemoryAcceptancePack delegates to the test-supplied runMemoryFn.
// Panics (nil dereference) if the fake was built without the hook.
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
	return f.runMemoryFn(baseDir)
}
|
||||||
|
|
||||||
|
// RunStorageAcceptancePack delegates to the test-supplied runStorageFn.
// Panics (nil dereference) if the fake was built without the hook.
func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
	return f.runStorageFn(baseDir)
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
||||||
|
if f.runCPUFn != nil {
|
||||||
|
return f.runCPUFn(baseDir, durationSec)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) DetectGPUVendor() string {
|
||||||
|
if f.detectVendorFn != nil {
|
||||||
|
return f.detectVendorFn()
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||||
|
if f.listAMDGPUsFn != nil {
|
||||||
|
return f.listAMDGPUsFn()
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
|
||||||
|
if f.runAMDPackFn != nil {
|
||||||
|
return f.runAMDPackFn(baseDir)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunFanStressTest is a stub: it ignores all arguments and reports success.
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
	return "", nil
}
|
||||||
|
|
||||||
|
// RunNCCLTests is a stub: it ignores all arguments and reports success.
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
	return "", nil
}
|
||||||
|
|
||||||
|
// TestNetworkStatusFormatsInterfacesAndRoute verifies that NetworkStatus
// renders one "- <name>: state=... ip=..." line per interface, uses a
// "(no IPv4)" placeholder for address-less interfaces, and appends the
// default route line.
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
	t.Parallel()

	a := &App{
		network: fakeNetwork{
			listInterfacesFn: func() ([]platform.InterfaceInfo, error) {
				return []platform.InterfaceInfo{
					{Name: "eth0", State: "UP", IPv4: []string{"10.0.0.2/24"}},
					{Name: "eth1", State: "DOWN", IPv4: nil},
				}, nil
			},
			defaultRouteFn: func() string { return "10.0.0.1" },
		},
		runtime: fakeRuntime{
			collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
		},
	}

	result, err := a.NetworkStatus()
	if err != nil {
		t.Fatalf("NetworkStatus error: %v", err)
	}
	if result.Title != "Network status" {
		t.Fatalf("title=%q want %q", result.Title, "Network status")
	}
	if want := "- eth0: state=UP ip=10.0.0.2/24"; !contains(result.Body, want) {
		t.Fatalf("body missing %q\nbody=%s", want, result.Body)
	}
	// eth1 has no IPv4 address, so the placeholder must be shown.
	if want := "- eth1: state=DOWN ip=(no IPv4)"; !contains(result.Body, want) {
		t.Fatalf("body missing %q\nbody=%s", want, result.Body)
	}
	if want := "Default route: 10.0.0.1"; !contains(result.Body, want) {
		t.Fatalf("body missing %q\nbody=%s", want, result.Body)
	}
}
|
||||||
|
|
||||||
|
// TestNetworkStatusHandlesNoInterfaces verifies the fallback body used
// when interface enumeration yields an empty list.
func TestNetworkStatusHandlesNoInterfaces(t *testing.T) {
	t.Parallel()

	a := &App{
		network: fakeNetwork{
			listInterfacesFn: func() ([]platform.InterfaceInfo, error) { return nil, nil },
			defaultRouteFn:   func() string { return "" },
		},
		runtime: fakeRuntime{
			collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
		},
	}

	result, err := a.NetworkStatus()
	if err != nil {
		t.Fatalf("NetworkStatus error: %v", err)
	}
	if result.Body != "No physical interfaces found." {
		t.Fatalf("body=%q want %q", result.Body, "No physical interfaces found.")
	}
}
|
||||||
|
|
||||||
|
// TestNetworkStatusPropagatesListError verifies that an interface-listing
// failure is surfaced as an error while the result still carries the
// standard title.
func TestNetworkStatusPropagatesListError(t *testing.T) {
	t.Parallel()

	a := &App{
		network: fakeNetwork{
			listInterfacesFn: func() ([]platform.InterfaceInfo, error) {
				return nil, errors.New("boom")
			},
			defaultRouteFn: func() string { return "" },
		},
		runtime: fakeRuntime{
			collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
		},
	}

	result, err := a.NetworkStatus()
	if err == nil {
		t.Fatal("expected error")
	}
	if result.Title != "Network status" {
		t.Fatalf("title=%q want %q", result.Title, "Network status")
	}
}
|
||||||
|
|
||||||
|
// TestParseStaticIPv4ConfigAndDefaults verifies two behaviors:
//   - DefaultStaticIPv4FormFields trims the default route and pre-fills
//     a /24 prefix;
//   - ParseStaticIPv4Config trims each form field and splits the DNS
//     field on whitespace.
func TestParseStaticIPv4ConfigAndDefaults(t *testing.T) {
	t.Parallel()

	a := &App{
		network: fakeNetwork{
			// Deliberately padded to prove the value is trimmed.
			defaultRouteFn: func() string { return " 192.168.1.1 " },
			listInterfacesFn: func() ([]platform.InterfaceInfo, error) {
				return nil, nil
			},
			dhcpOneFn:       func(string) (string, error) { return "", nil },
			dhcpAllFn:       func() (string, error) { return "", nil },
			setStaticIPv4Fn: func(platform.StaticIPv4Config) (string, error) { return "", nil },
		},
		runtime: fakeRuntime{
			collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
		},
	}

	defaults := a.DefaultStaticIPv4FormFields("eth0")
	if len(defaults) != 4 {
		t.Fatalf("len(defaults)=%d want 4", len(defaults))
	}
	if defaults[1] != "24" || defaults[2] != "192.168.1.1" {
		t.Fatalf("unexpected defaults: %#v", defaults)
	}

	// All fields padded with spaces; DNS holds two space-separated servers.
	cfg := a.ParseStaticIPv4Config("eth0", []string{
		" 10.10.0.5 ",
		" 23 ",
		" 10.10.0.1 ",
		" 1.1.1.1 8.8.8.8 ",
	})
	if cfg.Interface != "eth0" || cfg.Address != "10.10.0.5" || cfg.Prefix != "23" || cfg.Gateway != "10.10.0.1" {
		t.Fatalf("unexpected cfg: %#v", cfg)
	}
	if len(cfg.DNS) != 2 || cfg.DNS[0] != "1.1.1.1" || cfg.DNS[1] != "8.8.8.8" {
		t.Fatalf("unexpected dns: %#v", cfg.DNS)
	}
}
|
||||||
|
|
||||||
|
// TestServiceActionResults verifies the title/body formatting of
// ServiceStatusResult and ServiceActionResult when the underlying
// service calls succeed.
func TestServiceActionResults(t *testing.T) {
	t.Parallel()

	a := &App{
		services: fakeServices{
			serviceStatusFn: func(name string) (string, error) {
				return "active", nil
			},
			serviceDoFn: func(name string, action platform.ServiceAction) (string, error) {
				// Echo the action so the test can confirm it was forwarded.
				return string(action) + " ok", nil
			},
		},
		runtime: fakeRuntime{
			collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
		},
	}

	statusResult, err := a.ServiceStatusResult("bee-audit")
	if err != nil {
		t.Fatalf("ServiceStatusResult error: %v", err)
	}
	if statusResult.Title != "service status: bee-audit" || statusResult.Body != "active" {
		t.Fatalf("unexpected status result: %#v", statusResult)
	}

	actionResult, err := a.ServiceActionResult("bee-audit", platform.ServiceRestart)
	if err != nil {
		t.Fatalf("ServiceActionResult error: %v", err)
	}
	if actionResult.Title != "service restart: bee-audit" || actionResult.Body != "restart ok" {
		t.Fatalf("unexpected action result: %#v", actionResult)
	}
}
|
||||||
|
|
||||||
|
// TestToolCheckAndLogTailResults verifies that ToolCheckResult renders
// "OK (path)" / "MISSING" per tool, and that AuditLogTailResult joins the
// tails of the plain-text and JSON audit logs with a blank line.
func TestToolCheckAndLogTailResults(t *testing.T) {
	t.Parallel()

	a := &App{
		tools: fakeTools{
			// Echo the path so the log-tail assertion below can check
			// which files were tailed.
			tailFileFn: func(path string, lines int) string {
				return path
			},
			checkToolsFn: func(names []string) []platform.ToolStatus {
				return []platform.ToolStatus{
					{Name: "dmidecode", OK: true, Path: "/usr/bin/dmidecode"},
					{Name: "smartctl", OK: false},
				}
			},
		},
	}

	toolsResult := a.ToolCheckResult([]string{"dmidecode", "smartctl"})
	if toolsResult.Title != "Required tools" {
		t.Fatalf("title=%q want %q", toolsResult.Title, "Required tools")
	}
	if want := "- dmidecode: OK (/usr/bin/dmidecode)"; !contains(toolsResult.Body, want) {
		t.Fatalf("body missing %q\nbody=%s", want, toolsResult.Body)
	}
	if want := "- smartctl: MISSING"; !contains(toolsResult.Body, want) {
		t.Fatalf("body missing %q\nbody=%s", want, toolsResult.Body)
	}

	logResult := a.AuditLogTailResult()
	if logResult.Title != "Audit log tail" {
		t.Fatalf("title=%q want %q", logResult.Title, "Audit log tail")
	}
	// tailFileFn echoes paths, so the body must be the two default log
	// paths separated by a blank line.
	if want := DefaultAuditLogPath + "\n\n" + DefaultAuditJSONPath; logResult.Body != want {
		t.Fatalf("body=%q want %q", logResult.Body, want)
	}
}
|
||||||
|
|
||||||
|
// TestActionResultsUseFallbackBody verifies that every *Result wrapper
// substitutes a human-readable fallback body when the underlying
// operation returns empty (or whitespace-only) output.
func TestActionResultsUseFallbackBody(t *testing.T) {
	t.Parallel()

	// Every hook returns empty/blank output so the fallbacks kick in.
	a := &App{
		network: fakeNetwork{
			dhcpOneFn:       func(string) (string, error) { return " ", nil },
			dhcpAllFn:       func() (string, error) { return "", nil },
			setStaticIPv4Fn: func(platform.StaticIPv4Config) (string, error) { return "", nil },
			listInterfacesFn: func() ([]platform.InterfaceInfo, error) {
				return nil, nil
			},
			defaultRouteFn: func() string { return "" },
		},
		services: fakeServices{
			serviceStatusFn: func(string) (string, error) { return "", nil },
			serviceDoFn:     func(string, platform.ServiceAction) (string, error) { return "", nil },
		},
		tools: fakeTools{
			tailFileFn:   func(string, int) string { return " " },
			checkToolsFn: func([]string) []platform.ToolStatus { return nil },
		},
		sat: fakeSAT{
			runNvidiaFn:  func(string) (string, error) { return "", nil },
			runMemoryFn:  func(string) (string, error) { return "", nil },
			runStorageFn: func(string) (string, error) { return "", nil },
		},
		runtime: fakeRuntime{
			collectFn: func(string) (schema.RuntimeHealth, error) {
				return schema.RuntimeHealth{Status: "PARTIAL", ExportDir: "/tmp/export"}, nil
			},
		},
	}

	if got, _ := a.DHCPOneResult("eth0"); got.Body != "DHCP completed." {
		t.Fatalf("dhcp one body=%q", got.Body)
	}
	if got, _ := a.DHCPAllResult(); got.Body != "DHCP completed." {
		t.Fatalf("dhcp all body=%q", got.Body)
	}
	if got, _ := a.SetStaticIPv4Result(platform.StaticIPv4Config{Interface: "eth0"}); got.Body != "Static IPv4 updated." {
		t.Fatalf("static body=%q", got.Body)
	}
	if got, _ := a.ServiceStatusResult("bee-audit"); got.Body != "No status output." {
		t.Fatalf("status body=%q", got.Body)
	}
	if got, _ := a.ServiceActionResult("bee-audit", platform.ServiceRestart); got.Body != "Action completed." {
		t.Fatalf("action body=%q", got.Body)
	}
	if got := a.ToolCheckResult(nil); got.Body != "No tools checked." {
		t.Fatalf("tool body=%q", got.Body)
	}
	if got := a.AuditLogTailResult(); got.Body != "No audit logs found." {
		t.Fatalf("log body=%q", got.Body)
	}
	if got, _ := a.RunNvidiaAcceptancePackResult(""); got.Body != "Archive written." {
		t.Fatalf("sat body=%q", got.Body)
	}
	if got, _ := a.RunMemoryAcceptancePackResult(""); got.Body != "No output produced." {
		t.Fatalf("memory sat body=%q", got.Body)
	}
	if got, _ := a.RunStorageAcceptancePackResult(""); got.Body != "No output produced." {
		t.Fatalf("storage sat body=%q", got.Body)
	}
}
|
||||||
|
|
||||||
|
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tmp := t.TempDir()
|
||||||
|
oldExportDir := DefaultExportDir
|
||||||
|
DefaultExportDir = tmp
|
||||||
|
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||||
|
|
||||||
|
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("write bee-audit.json: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("write bee-audit.log: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
a := &App{
|
||||||
|
exports: fakeExports{
|
||||||
|
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
|
||||||
|
if filepath.Base(src) == "" {
|
||||||
|
t.Fatalf("expected non-empty source path")
|
||||||
|
}
|
||||||
|
return "/media/bee/" + filepath.Base(src), nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExportSupportBundleResult error: %v", err)
|
||||||
|
}
|
||||||
|
if result.Title != "Export support bundle" {
|
||||||
|
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
|
||||||
|
}
|
||||||
|
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
|
||||||
|
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tmp := t.TempDir()
|
||||||
|
oldExportDir := DefaultExportDir
|
||||||
|
DefaultExportDir = tmp
|
||||||
|
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||||
|
|
||||||
|
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("write bee-audit.json: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("write bee-audit.log: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
a := &App{
|
||||||
|
exports: fakeExports{
|
||||||
|
exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
|
||||||
|
return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected export error")
|
||||||
|
}
|
||||||
|
if contains(result.Body, "exported to") {
|
||||||
|
t.Fatalf("body should not claim success:\n%s", result.Body)
|
||||||
|
}
|
||||||
|
if result.Body != "Support bundle export failed." {
|
||||||
|
t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestRunNvidiaAcceptancePackResult verifies that the requested base
// directory is forwarded to the SAT runner and that the archive path is
// echoed into the result body.
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
	t.Parallel()

	a := &App{
		sat: fakeSAT{
			runNvidiaFn: func(baseDir string) (string, error) {
				if baseDir != "/tmp/sat" {
					t.Fatalf("baseDir=%q want %q", baseDir, "/tmp/sat")
				}
				return "/tmp/sat/out.tar.gz", nil
			},
			runMemoryFn:  func(string) (string, error) { return "", nil },
			runStorageFn: func(string) (string, error) { return "", nil },
		},
		runtime: fakeRuntime{
			collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
		},
	}

	result, err := a.RunNvidiaAcceptancePackResult("/tmp/sat")
	if err != nil {
		t.Fatalf("RunNvidiaAcceptancePackResult error: %v", err)
	}
	if result.Title != "NVIDIA SAT" || result.Body != "Archive written to /tmp/sat/out.tar.gz" {
		t.Fatalf("unexpected result: %#v", result)
	}
}
|
||||||
|
|
||||||
|
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldSATBaseDir := DefaultSATBaseDir
|
||||||
|
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||||
|
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||||
|
|
||||||
|
a := &App{
|
||||||
|
sat: fakeSAT{
|
||||||
|
runNvidiaFn: func(baseDir string) (string, error) {
|
||||||
|
if baseDir != "/tmp/export/bee-sat" {
|
||||||
|
t.Fatalf("nvidia baseDir=%q", baseDir)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
},
|
||||||
|
runMemoryFn: func(baseDir string) (string, error) {
|
||||||
|
if baseDir != "/tmp/export/bee-sat" {
|
||||||
|
t.Fatalf("memory baseDir=%q", baseDir)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
},
|
||||||
|
runStorageFn: func(baseDir string) (string, error) {
|
||||||
|
if baseDir != "/tmp/export/bee-sat" {
|
||||||
|
t.Fatalf("storage baseDir=%q", baseDir)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
runtime: fakeRuntime{
|
||||||
|
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := a.RunNvidiaAcceptancePack(""); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if _, err := a.RunMemoryAcceptancePack(""); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if _, err := a.RunStorageAcceptancePack(""); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestFormatSATSummary verifies the compact rendering of a raw
// key=value SAT summary, including the optional "Devices:" line.
func TestFormatSATSummary(t *testing.T) {
	t.Parallel()

	got := formatSATSummary("Memory SAT", "overall_status=PARTIAL\njob_ok=2\njob_failed=0\njob_unsupported=1\ndevices=3\n")
	want := "Memory SAT: PARTIAL ok=2 failed=0 unsupported=1\nDevices: 3"
	if got != want {
		t.Fatalf("got %q want %q", got, want)
	}
}
|
||||||
|
|
||||||
|
// TestHealthSummaryResultIncludesCompactSATSummary verifies that the
// health summary picks up a SAT run's summary.txt and renders it in the
// compact "Memory SAT: OK ok=... failed=..." form.
// Runs serially (no t.Parallel) because it swaps the package-level
// DefaultAuditJSONPath and DefaultSATBaseDir globals.
func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
	tmp := t.TempDir()
	oldAuditPath := DefaultAuditJSONPath
	oldSATBaseDir := DefaultSATBaseDir
	DefaultAuditJSONPath = filepath.Join(tmp, "audit.json")
	DefaultSATBaseDir = filepath.Join(tmp, "sat")
	t.Cleanup(func() { DefaultAuditJSONPath = oldAuditPath })
	t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })

	satDir := filepath.Join(DefaultSATBaseDir, "memory-testcase")
	if err := os.MkdirAll(satDir, 0755); err != nil {
		t.Fatalf("mkdir sat dir: %v", err)
	}

	// Minimal audit snapshot plus one memory SAT summary on disk.
	raw := `{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"serial_number":"DISK1","status":"Warning"}]}}`
	if err := os.WriteFile(DefaultAuditJSONPath, []byte(raw), 0644); err != nil {
		t.Fatalf("write audit json: %v", err)
	}
	if err := os.WriteFile(filepath.Join(satDir, "summary.txt"), []byte("overall_status=OK\njob_ok=3\njob_failed=0\njob_unsupported=0\n"), 0644); err != nil {
		t.Fatalf("write sat summary: %v", err)
	}

	result := (&App{}).HealthSummaryResult()
	if !contains(result.Body, "Memory SAT: OK ok=3 failed=0") {
		t.Fatalf("body missing compact sat summary:\n%s", result.Body)
	}
}
|
||||||
|
|
||||||
|
// TestBuildSupportBundleIncludesExportDirContents verifies that the
// support bundle tarball contains raw SAT logs from the export tree but
// excludes nested SAT .tar.gz archives (avoiding archive-in-archive).
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
	tmp := t.TempDir()
	exportDir := filepath.Join(tmp, "export")
	if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
		t.Fatal(err)
	}
	// Raw log: must be included in the bundle.
	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
		t.Fatal(err)
	}
	// Nested archive: must be excluded from the bundle.
	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
		t.Fatal(err)
	}

	archive, err := BuildSupportBundle(exportDir)
	if err != nil {
		t.Fatalf("BuildSupportBundle error: %v", err)
	}
	if _, err := os.Stat(archive); err != nil {
		t.Fatalf("archive stat: %v", err)
	}

	file, err := os.Open(archive)
	if err != nil {
		t.Fatalf("open archive: %v", err)
	}
	defer file.Close()

	gzr, err := gzip.NewReader(file)
	if err != nil {
		t.Fatalf("gzip reader: %v", err)
	}
	defer gzr.Close()

	// Collect every entry name from the tar stream.
	tr := tar.NewReader(gzr)
	var names []string
	for {
		hdr, err := tr.Next()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			t.Fatalf("read tar entry: %v", err)
		}
		names = append(names, hdr.Name)
	}

	var foundRaw bool
	for _, name := range names {
		if contains(name, "/export/bee-sat/memory-run/verbose.log") {
			foundRaw = true
		}
		if contains(name, "/export/bee-sat/memory-run.tar.gz") {
			t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
		}
	}
	if !foundRaw {
		t.Fatalf("support bundle missing raw SAT log, names=%v", names)
	}
}
|
||||||
|
|
||||||
|
// TestMainBanner verifies that the main banner aggregates system,
// CPU-count, total-memory, total-storage, GPU-count, and IP lines from
// the audit snapshot plus live interface data.
// Runs serially (no t.Parallel) because it swaps the package-level
// DefaultAuditJSONPath global.
func TestMainBanner(t *testing.T) {
	tmp := t.TempDir()
	oldAuditPath := DefaultAuditJSONPath
	DefaultAuditJSONPath = filepath.Join(tmp, "audit.json")
	t.Cleanup(func() { DefaultAuditJSONPath = oldAuditPath })

	// Pointer scratch values for the optional snapshot fields.
	trueValue := true
	manufacturer := "Dell"
	product := "PowerEdge R760"
	cpuModel := "Intel Xeon Gold 6430"
	memoryType := "DDR5"
	gpuClass := "VideoController"
	gpuModel := "NVIDIA H100"

	// Two of everything so the banner has to count and sum.
	payload := schema.HardwareIngestRequest{
		Hardware: schema.HardwareSnapshot{
			Board: schema.HardwareBoard{
				Manufacturer: &manufacturer,
				ProductName:  &product,
				SerialNumber: "SRV123",
			},
			CPUs: []schema.HardwareCPU{
				{Model: &cpuModel},
				{Model: &cpuModel},
			},
			Memory: []schema.HardwareMemory{
				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
			},
			Storage: []schema.HardwareStorage{
				{Present: &trueValue, SizeGB: intPtr(3840)},
				{Present: &trueValue, SizeGB: intPtr(3840)},
			},
			PCIeDevices: []schema.HardwarePCIeDevice{
				{DeviceClass: &gpuClass, Model: &gpuModel},
				{DeviceClass: &gpuClass, Model: &gpuModel},
			},
		},
	}

	raw, err := json.Marshal(payload)
	if err != nil {
		t.Fatalf("marshal: %v", err)
	}
	if err := os.WriteFile(DefaultAuditJSONPath, raw, 0644); err != nil {
		t.Fatalf("write audit json: %v", err)
	}

	a := &App{
		network: fakeNetwork{
			listInterfacesFn: func() ([]platform.InterfaceInfo, error) {
				return []platform.InterfaceInfo{
					{Name: "eth0", IPv4: []string{"10.0.0.10"}},
					{Name: "eth1", IPv4: []string{"192.168.1.10"}},
				}, nil
			},
		},
	}

	got := a.MainBanner()
	for _, want := range []string{
		"System: Dell PowerEdge R760 | S/N SRV123",
		"CPU: 2 x Intel Xeon Gold 6430",
		"Memory: 1.0 TB DDR5 (2 DIMMs)",
		"Storage: 2 drives / 7.5 TB",
		"GPU: 2 x NVIDIA H100",
		"IP: 10.0.0.10, 192.168.1.10",
	} {
		if !contains(got, want) {
			t.Fatalf("banner missing %q:\n%s", want, got)
		}
	}
}
|
||||||
|
|
||||||
|
// TestRuntimeHealthResultUsesAMDLabels verifies that when the GPU vendor
// is detected as "amd", the runtime-health body uses AMDGPU/ROCm labels
// and never mentions CUDA.
// Runs serially (no t.Parallel) because it swaps the package-level
// DefaultRuntimeJSONPath global.
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
	tmp := t.TempDir()
	oldRuntimePath := DefaultRuntimeJSONPath
	DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
	t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })

	raw, err := json.Marshal(schema.RuntimeHealth{
		Status:        "OK",
		ExportDir:     "/appdata/bee/export",
		DriverReady:   true,
		CUDAReady:     true,
		NetworkStatus: "OK",
	})
	if err != nil {
		t.Fatalf("marshal runtime health: %v", err)
	}
	if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
		t.Fatalf("write runtime health: %v", err)
	}

	a := &App{
		sat: fakeSAT{
			detectVendorFn: func() string { return "amd" },
		},
	}

	result := a.RuntimeHealthResult()
	if !contains(result.Body, "AMDGPU ready: true") {
		t.Fatalf("body missing AMD driver label:\n%s", result.Body)
	}
	if !contains(result.Body, "ROCm SMI ready: true") {
		t.Fatalf("body missing ROCm label:\n%s", result.Body)
	}
	if contains(result.Body, "CUDA ready") {
		t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
	}
}
|
||||||
|
|
||||||
|
// intPtr returns a pointer to a copy of v; test fixtures use it to fill
// optional *int snapshot fields inline.
func intPtr(v int) *int {
	value := v
	return &value
}
|
||||||
|
|
||||||
|
// contains reports whether needle occurs as a substring of haystack.
// An empty needle matches any haystack. (Hand-rolled so the test file
// stays free of extra imports.)
func contains(haystack, needle string) bool {
	for start := 0; start+len(needle) <= len(haystack); start++ {
		if haystack[start:start+len(needle)] == needle {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// containsAt scans haystack for needle at every possible offset and
// reports whether a match exists.
func containsAt(haystack, needle string) bool {
	last := len(haystack) - len(needle)
	for start := 0; start <= last; start++ {
		if haystack[start:start+len(needle)] == needle {
			return true
		}
	}
	return false
}
|
||||||
387
audit/internal/app/panel.go
Normal file
387
audit/internal/app/panel.go
Normal file
@@ -0,0 +1,387 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ComponentRow is one line in the hardware panel.
type ComponentRow struct {
	Key    string // component label: "CPU", "MEM", "GPU", "DISK", "PSU"
	Status string // SAT outcome: "PASS", "FAIL", "CANCEL", "N/A"
	Detail string // compact one-liner describing the component
}
|
||||||
|
|
||||||
|
// HardwarePanelData holds everything the TUI right panel needs.
type HardwarePanelData struct {
	Header []string       // system/BIOS/BMC/IP summary lines shown above the table
	Rows   []ComponentRow // one row per detected component
}
|
||||||
|
|
||||||
|
// LoadHardwarePanel reads the latest audit JSON and SAT summaries.
// Returns empty panel if no audit data exists yet.
//
// The header collects system, BIOS/BMC firmware and IP lines; the rows
// cover CPU/MEM/GPU/DISK/PSU, each tagged with the status reported by
// satStatuses(). Rows whose format helper returns "" are omitted.
func (a *App) LoadHardwarePanel() HardwarePanelData {
	// Missing file is the "audit never ran" case, not an error.
	raw, err := os.ReadFile(DefaultAuditJSONPath)
	if err != nil {
		return HardwarePanelData{Header: []string{"No audit data — run audit first."}}
	}
	var snap schema.HardwareIngestRequest
	if err := json.Unmarshal(raw, &snap); err != nil {
		return HardwarePanelData{Header: []string{"Audit data unreadable."}}
	}

	// Per-component SAT statuses keyed by "cpu", "memory", "gpu", "storage".
	statuses := satStatuses()

	var header []string
	if sys := formatSystemLine(snap.Hardware.Board); sys != "" {
		header = append(header, sys)
	}
	for _, fw := range snap.Hardware.Firmware {
		if fw.DeviceName == "BIOS" && fw.Version != "" {
			header = append(header, "BIOS: "+fw.Version)
		}
		if fw.DeviceName == "BMC" && fw.Version != "" {
			header = append(header, "BMC: "+fw.Version)
		}
	}
	if ip := formatIPLine(a.network.ListInterfaces); ip != "" {
		header = append(header, ip)
	}

	var rows []ComponentRow

	// Each Detail reuses the banner formatters with their "<Key>: "
	// prefix stripped, so panel and banner never drift apart.
	if cpu := formatCPULine(snap.Hardware.CPUs); cpu != "" {
		rows = append(rows, ComponentRow{
			Key:    "CPU",
			Status: statuses["cpu"],
			Detail: strings.TrimPrefix(cpu, "CPU: "),
		})
	}
	if mem := formatMemoryLine(snap.Hardware.Memory); mem != "" {
		rows = append(rows, ComponentRow{
			Key:    "MEM",
			Status: statuses["memory"],
			Detail: strings.TrimPrefix(mem, "Memory: "),
		})
	}
	if gpu := formatGPULine(snap.Hardware.PCIeDevices); gpu != "" {
		rows = append(rows, ComponentRow{
			Key:    "GPU",
			Status: statuses["gpu"],
			Detail: strings.TrimPrefix(gpu, "GPU: "),
		})
	}
	if disk := formatStorageLine(snap.Hardware.Storage); disk != "" {
		rows = append(rows, ComponentRow{
			Key:    "DISK",
			Status: statuses["storage"],
			Detail: strings.TrimPrefix(disk, "Storage: "),
		})
	}
	if psu := formatPSULine(snap.Hardware.PowerSupplies); psu != "" {
		// No PSU SAT exists, so the status is always "N/A".
		rows = append(rows, ComponentRow{
			Key:    "PSU",
			Status: "N/A",
			Detail: psu,
		})
	}

	return HardwarePanelData{Header: header, Rows: rows}
}
|
||||||
|
|
||||||
|
// ComponentDetailResult returns detail text for a component shown in the panel.
|
||||||
|
func (a *App) ComponentDetailResult(key string) ActionResult {
|
||||||
|
switch key {
|
||||||
|
case "CPU":
|
||||||
|
return a.cpuDetailResult(false)
|
||||||
|
case "MEM":
|
||||||
|
return a.satDetailResult("memory", "memory-", "MEM detail")
|
||||||
|
case "GPU":
|
||||||
|
// Prefer whichever GPU SAT was run most recently.
|
||||||
|
nv, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-nvidia-*/summary.txt"))
|
||||||
|
am, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-amd-*/summary.txt"))
|
||||||
|
sort.Strings(nv)
|
||||||
|
sort.Strings(am)
|
||||||
|
latestNV := ""
|
||||||
|
if len(nv) > 0 {
|
||||||
|
latestNV = nv[len(nv)-1]
|
||||||
|
}
|
||||||
|
latestAM := ""
|
||||||
|
if len(am) > 0 {
|
||||||
|
latestAM = am[len(am)-1]
|
||||||
|
}
|
||||||
|
if latestAM > latestNV {
|
||||||
|
return a.satDetailResult("gpu", "gpu-amd-", "GPU detail")
|
||||||
|
}
|
||||||
|
return a.satDetailResult("gpu", "gpu-nvidia-", "GPU detail")
|
||||||
|
case "DISK":
|
||||||
|
return a.satDetailResult("storage", "storage-", "DISK detail")
|
||||||
|
case "PSU":
|
||||||
|
return a.psuDetailResult()
|
||||||
|
default:
|
||||||
|
return ActionResult{Title: key, Body: "No detail available."}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cpuDetailResult builds the CPU detail view. It always tries to include the
// most recent CPU SAT summary; when satOnly is false it additionally appends
// a per-socket dump of the last audit snapshot read from DefaultAuditJSONPath.
// Audit-read failures are non-fatal: whatever SAT text was collected is
// returned as-is.
func (a *App) cpuDetailResult(satOnly bool) ActionResult {
	var b strings.Builder

	// Show latest SAT summary if available.
	satResult := a.satDetailResult("cpu", "cpu-", "CPU SAT")
	// NOTE(review): this sentinel comparison must stay in sync with the exact
	// "no results" body produced by satDetailResult.
	if satResult.Body != "No test results found. Run a test first." {
		fmt.Fprintln(&b, "=== Last SAT ===")
		fmt.Fprintln(&b, satResult.Body)
		fmt.Fprintln(&b)
	}

	if satOnly {
		body := strings.TrimSpace(b.String())
		if body == "" {
			body = "No CPU SAT results found. Run a test first."
		}
		return ActionResult{Title: "CPU SAT", Body: body}
	}

	// Best-effort audit section: any read/parse failure falls back to the
	// SAT-only body collected above.
	raw, err := os.ReadFile(DefaultAuditJSONPath)
	if err != nil {
		return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
	}
	var snap schema.HardwareIngestRequest
	if err := json.Unmarshal(raw, &snap); err != nil {
		return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
	}
	if len(snap.Hardware.CPUs) == 0 {
		return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
	}
	fmt.Fprintln(&b, "=== Audit ===")
	// One section per CPU; optional fields are printed only when present.
	for i, cpu := range snap.Hardware.CPUs {
		fmt.Fprintf(&b, "CPU %d\n", i)
		if cpu.Model != nil {
			fmt.Fprintf(&b, " Model: %s\n", *cpu.Model)
		}
		if cpu.Manufacturer != nil {
			fmt.Fprintf(&b, " Vendor: %s\n", *cpu.Manufacturer)
		}
		if cpu.Cores != nil {
			fmt.Fprintf(&b, " Cores: %d\n", *cpu.Cores)
		}
		if cpu.Threads != nil {
			fmt.Fprintf(&b, " Threads: %d\n", *cpu.Threads)
		}
		if cpu.MaxFrequencyMHz != nil {
			fmt.Fprintf(&b, " Max freq: %d MHz\n", *cpu.MaxFrequencyMHz)
		}
		if cpu.TemperatureC != nil {
			fmt.Fprintf(&b, " Temp: %.1f°C\n", *cpu.TemperatureC)
		}
		if cpu.Throttled != nil {
			fmt.Fprintf(&b, " Throttled: %v\n", *cpu.Throttled)
		}
		// ECC counters are only shown when non-zero to keep the view short.
		if cpu.CorrectableErrorCount != nil && *cpu.CorrectableErrorCount > 0 {
			fmt.Fprintf(&b, " ECC correctable: %d\n", *cpu.CorrectableErrorCount)
		}
		if cpu.UncorrectableErrorCount != nil && *cpu.UncorrectableErrorCount > 0 {
			fmt.Fprintf(&b, " ECC uncorrectable: %d\n", *cpu.UncorrectableErrorCount)
		}
		// Blank separator line between CPUs, but not after the last one.
		if i < len(snap.Hardware.CPUs)-1 {
			fmt.Fprintln(&b)
		}
	}
	return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
}
|
||||||
|
|
||||||
|
func (a *App) satDetailResult(statusKey, prefix, title string) ActionResult {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, prefix+"*/summary.txt"))
|
||||||
|
if err != nil || len(matches) == 0 {
|
||||||
|
return ActionResult{Title: title, Body: "No test results found. Run a test first."}
|
||||||
|
}
|
||||||
|
sort.Strings(matches)
|
||||||
|
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||||
|
if err != nil {
|
||||||
|
return ActionResult{Title: title, Body: "Could not read test results."}
|
||||||
|
}
|
||||||
|
return ActionResult{Title: title, Body: formatSATDetail(strings.TrimSpace(string(raw)))}
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatSATDetail converts raw summary.txt key=value content to a human-readable per-step display.
// Layout: optional "Run: <ts>" header, one PASS/FAIL/SKIP/? line per step in
// file order, then an optional "Overall: …" footer with job counters.
func formatSATDetail(raw string) string {
	var b strings.Builder
	kv := parseKeyValueSummary(raw)

	if t, ok := kv["run_at_utc"]; ok {
		fmt.Fprintf(&b, "Run: %s\n\n", t)
	}

	// Collect step names in order they appear in the file
	// (the kv map loses ordering, so we re-scan the raw lines).
	lines := strings.Split(raw, "\n")
	var stepKeys []string
	seenStep := map[string]bool{}
	for _, line := range lines {
		if idx := strings.Index(line, "_status="); idx >= 0 {
			key := line[:idx]
			// "overall" is rendered separately in the footer below.
			if !seenStep[key] && key != "overall" {
				seenStep[key] = true
				stepKeys = append(stepKeys, key)
			}
		}
	}

	for _, key := range stepKeys {
		status := kv[key+"_status"]
		display := cleanSummaryKey(key)
		switch status {
		case "OK":
			fmt.Fprintf(&b, "PASS %s\n", display)
		case "FAILED":
			fmt.Fprintf(&b, "FAIL %s\n", display)
		case "UNSUPPORTED":
			fmt.Fprintf(&b, "SKIP %s\n", display)
		default:
			// Unknown/missing status values are surfaced rather than hidden.
			fmt.Fprintf(&b, "? %s\n", display)
		}
	}

	if overall, ok := kv["overall_status"]; ok {
		ok2 := kv["job_ok"]
		failed := kv["job_failed"]
		fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
	}

	return strings.TrimSpace(b.String())
}
|
||||||
|
|
||||||
|
// cleanSummaryKey strips the leading numeric prefix from a SAT step key.
// "1-lscpu" → "lscpu", "3-stress-ng" → "stress-ng"
// Keys without a dash, starting with a dash, or whose prefix is not all
// digits are returned unchanged.
func cleanSummaryKey(key string) string {
	sep := strings.IndexByte(key, '-')
	if sep <= 0 {
		return key
	}
	for _, c := range key[:sep] {
		if c < '0' || c > '9' {
			return key
		}
	}
	return key[sep+1:]
}
|
||||||
|
|
||||||
|
// psuDetailResult renders the power-supply section of the last audit snapshot
// read from DefaultAuditJSONPath. Optional fields are printed only when set;
// read, parse, and empty-data cases each return a distinct fallback body.
func (a *App) psuDetailResult() ActionResult {
	raw, err := os.ReadFile(DefaultAuditJSONPath)
	if err != nil {
		return ActionResult{Title: "PSU", Body: "No audit data."}
	}
	var snap schema.HardwareIngestRequest
	if err := json.Unmarshal(raw, &snap); err != nil {
		return ActionResult{Title: "PSU", Body: "Audit data unreadable."}
	}
	if len(snap.Hardware.PowerSupplies) == 0 {
		return ActionResult{Title: "PSU", Body: "No PSU data in last audit."}
	}
	var b strings.Builder
	// One section per PSU, separated by blank lines (none after the last).
	for i, psu := range snap.Hardware.PowerSupplies {
		fmt.Fprintf(&b, "PSU %d\n", i)
		if psu.Model != nil {
			fmt.Fprintf(&b, " Model: %s\n", *psu.Model)
		}
		if psu.Vendor != nil {
			fmt.Fprintf(&b, " Vendor: %s\n", *psu.Vendor)
		}
		if psu.WattageW != nil {
			fmt.Fprintf(&b, " Rated: %d W\n", *psu.WattageW)
		}
		if psu.InputPowerW != nil {
			fmt.Fprintf(&b, " Input: %.1f W\n", *psu.InputPowerW)
		}
		if psu.OutputPowerW != nil {
			fmt.Fprintf(&b, " Output: %.1f W\n", *psu.OutputPowerW)
		}
		if psu.TemperatureC != nil {
			fmt.Fprintf(&b, " Temp: %.1f°C\n", *psu.TemperatureC)
		}
		if i < len(snap.Hardware.PowerSupplies)-1 {
			fmt.Fprintln(&b)
		}
	}
	return ActionResult{Title: "PSU", Body: strings.TrimSpace(b.String())}
}
|
||||||
|
|
||||||
|
// satStatuses reads the latest summary.txt for each SAT type and returns
// a map of component key ("gpu","memory","storage") → status ("PASS","FAIL","CANCEL","N/A").
// Components with no readable run keep "N/A". Unrecognized overall_status
// values also leave the entry at "N/A".
// NOTE(review): the two gpu prefixes are processed in slice order, so when
// both nvidia and amd runs exist the amd result overwrites the nvidia one
// regardless of which ran more recently — confirm this is intended.
func satStatuses() map[string]string {
	result := map[string]string{
		"gpu": "N/A",
		"memory": "N/A",
		"storage": "N/A",
		"cpu": "N/A",
	}
	patterns := []struct {
		key string
		prefix string
	}{
		{"gpu", "gpu-nvidia-"},
		{"gpu", "gpu-amd-"},
		{"memory", "memory-"},
		{"storage", "storage-"},
		{"cpu", "cpu-"},
	}
	for _, item := range patterns {
		matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
		if err != nil || len(matches) == 0 {
			continue
		}
		// Run directories embed timestamps: lexicographic max is the newest.
		sort.Strings(matches)
		raw, err := os.ReadFile(matches[len(matches)-1])
		if err != nil {
			continue
		}
		values := parseKeyValueSummary(string(raw))
		switch strings.ToUpper(strings.TrimSpace(values["overall_status"])) {
		case "OK":
			result[item.key] = "PASS"
		case "FAILED":
			result[item.key] = "FAIL"
		case "CANCELED", "CANCELLED":
			result[item.key] = "CANCEL"
		}
	}
	return result
}
|
||||||
|
|
||||||
|
func formatPSULine(psus []schema.HardwarePowerSupply) string {
|
||||||
|
var present []schema.HardwarePowerSupply
|
||||||
|
for _, psu := range psus {
|
||||||
|
if psu.Present != nil && !*psu.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
present = append(present, psu)
|
||||||
|
}
|
||||||
|
if len(present) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
firstW := 0
|
||||||
|
if present[0].WattageW != nil {
|
||||||
|
firstW = *present[0].WattageW
|
||||||
|
}
|
||||||
|
allSame := firstW > 0
|
||||||
|
for _, p := range present[1:] {
|
||||||
|
w := 0
|
||||||
|
if p.WattageW != nil {
|
||||||
|
w = *p.WattageW
|
||||||
|
}
|
||||||
|
if w != firstW {
|
||||||
|
allSame = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if allSame && firstW > 0 {
|
||||||
|
return fmt.Sprintf("%dx %dW", len(present), firstW)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%d PSU", len(present))
|
||||||
|
}
|
||||||
214
audit/internal/app/sat_overlay.go
Normal file
214
audit/internal/app/sat_overlay.go
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// applyLatestSATStatuses overlays the most recent SAT run results found under
// baseDir onto the snapshot's per-component statuses. Each category uses only
// its newest run directory. Missing or unreadable summaries are skipped.
// Categories are applied in a fixed order (amd GPU, nvidia GPU, memory, cpu,
// storage); mergeComponentStatus only replaces a status on strictly higher
// severity, so this order decides ties for components touched more than once.
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
	if snap == nil || strings.TrimSpace(baseDir) == "" {
		return
	}
	if summary, ok := loadLatestSATSummary(baseDir, "gpu-amd-"); ok {
		applyGPUVendorSAT(snap.PCIeDevices, "amd", summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
		applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
		applyMemorySAT(snap.Memory, summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "cpu-"); ok {
		applyCPUSAT(snap.CPUs, summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
		applyStorageSAT(snap.Storage, summary)
	}
}
|
||||||
|
|
||||||
|
// satSummary is a parsed SAT run summary.txt.
type satSummary struct {
	runAtUTC string // trimmed run_at_utc value; may be empty
	overall string // overall_status, trimmed and upper-cased
	kv map[string]string // every key=value pair from the file
}
|
||||||
|
|
||||||
|
func loadLatestSATSummary(baseDir, prefix string) (satSummary, bool) {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
|
||||||
|
if err != nil || len(matches) == 0 {
|
||||||
|
return satSummary{}, false
|
||||||
|
}
|
||||||
|
sort.Strings(matches)
|
||||||
|
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||||
|
if err != nil {
|
||||||
|
return satSummary{}, false
|
||||||
|
}
|
||||||
|
kv := parseKeyValueSummary(string(raw))
|
||||||
|
return satSummary{
|
||||||
|
runAtUTC: strings.TrimSpace(kv["run_at_utc"]),
|
||||||
|
overall: strings.ToUpper(strings.TrimSpace(kv["overall_status"])),
|
||||||
|
kv: kv,
|
||||||
|
}, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyGPUVendorSAT(devs []schema.HardwarePCIeDevice, vendor string, summary satSummary) {
|
||||||
|
status, description, ok := satSummaryStatus(summary, vendor+" GPU SAT")
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for i := range devs {
|
||||||
|
if !matchesGPUVendor(devs[i], vendor) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
mergeComponentStatus(&devs[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyMemorySAT(dimms []schema.HardwareMemory, summary satSummary) {
|
||||||
|
status, description, ok := satSummaryStatus(summary, "memory SAT")
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for i := range dimms {
|
||||||
|
mergeComponentStatus(&dimms[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyCPUSAT(cpus []schema.HardwareCPU, summary satSummary) {
|
||||||
|
status, description, ok := satSummaryStatus(summary, "CPU SAT")
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for i := range cpus {
|
||||||
|
mergeComponentStatus(&cpus[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyStorageSAT(disks []schema.HardwareStorage, summary satSummary) {
|
||||||
|
byDevice := parseStorageSATStatus(summary)
|
||||||
|
for i := range disks {
|
||||||
|
devPath, _ := disks[i].Telemetry["linux_device"].(string)
|
||||||
|
devName := filepath.Base(strings.TrimSpace(devPath))
|
||||||
|
if devName == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result, ok := byDevice[devName]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
mergeComponentStatus(&disks[i].HardwareComponentStatus, summary.runAtUTC, result.status, result.description)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// satStatusResult is a per-device verdict derived from storage SAT steps.
type satStatusResult struct {
	status string // "OK", "Warning", or "Critical"
	description string // explanation taken from the most severe step
	ok bool // true once any recognized step status was recorded
}
|
||||||
|
|
||||||
|
func parseStorageSATStatus(summary satSummary) map[string]satStatusResult {
|
||||||
|
result := map[string]satStatusResult{}
|
||||||
|
for key, value := range summary.kv {
|
||||||
|
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
base := strings.TrimSuffix(key, "_status")
|
||||||
|
idx := strings.Index(base, "_")
|
||||||
|
if idx <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
devName := base[:idx]
|
||||||
|
step := strings.ReplaceAll(base[idx+1:], "_", "-")
|
||||||
|
stepStatus, desc, ok := satKeyStatus(strings.ToUpper(strings.TrimSpace(value)), "storage "+step)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
current := result[devName]
|
||||||
|
if !current.ok || statusSeverity(stepStatus) > statusSeverity(current.status) {
|
||||||
|
result[devName] = satStatusResult{status: stepStatus, description: desc, ok: true}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// satSummaryStatus maps a summary's overall status to a component verdict,
// labeling the description with label (see satKeyStatus).
func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
	return satKeyStatus(summary.overall, label)
}
|
||||||
|
|
||||||
|
// satKeyStatus maps a raw SAT status token to a component status and a
// human-readable description built from label. The boolean is false for
// unrecognized tokens. Matching is case-insensitive and whitespace-trimmed.
func satKeyStatus(rawStatus, label string) (string, string, bool) {
	normalized := strings.ToUpper(strings.TrimSpace(rawStatus))
	switch normalized {
	case "OK":
		return "OK", label + " passed", true
	case "FAILED":
		return "Critical", label + " failed", true
	case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
		return "Warning", label + " incomplete", true
	}
	return "", "", false
}
|
||||||
|
|
||||||
|
// mergeComponentStatus overlays a SAT verdict onto a component's status.
// The SAT result only wins when the component has no status yet ("" or
// "Unknown") or when the SAT status is strictly more severe; on ties the
// existing status is kept. On a win it sets Status, ErrorDescription (only
// when description is non-empty) and — only when changedAt is non-empty —
// StatusChangedAt plus an appended StatusHistory entry.
func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
	if component == nil || satStatus == "" {
		return
	}
	current := strings.TrimSpace(ptrString(component.Status))
	if current == "" || current == "Unknown" || statusSeverity(satStatus) > statusSeverity(current) {
		component.Status = appStringPtr(satStatus)
		if strings.TrimSpace(description) != "" {
			component.ErrorDescription = appStringPtr(description)
		}
		// History is only recorded when the run timestamp is known.
		if strings.TrimSpace(changedAt) != "" {
			component.StatusChangedAt = appStringPtr(changedAt)
			component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
				Status: satStatus,
				ChangedAt: changedAt,
				Details: appStringPtr(description),
			})
		}
	}
}
|
||||||
|
|
||||||
|
// statusSeverity ranks component statuses for comparison: Critical (3) >
// Warning (2) > OK (1) > anything else (0). Input is whitespace-trimmed.
func statusSeverity(status string) int {
	ranks := map[string]int{
		"OK":       1,
		"Warning":  2,
		"Critical": 3,
	}
	return ranks[strings.TrimSpace(status)]
}
|
||||||
|
|
||||||
|
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||||
|
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||||
|
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||||
|
switch vendor {
|
||||||
|
case "amd":
|
||||||
|
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||||
|
case "nvidia":
|
||||||
|
return strings.Contains(manufacturer, "nvidia")
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ptrString dereferences v, treating nil as the empty string.
func ptrString(v *string) string {
	if v != nil {
		return *v
	}
	return ""
}
|
||||||
|
|
||||||
|
// appStringPtr returns a pointer to a copy of value.
func appStringPtr(value string) *string {
	copied := value
	return &copied
}
|
||||||
61
audit/internal/app/sat_overlay_test.go
Normal file
61
audit/internal/app/sat_overlay_test.go
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestApplyLatestSATStatusesMarksStorageByDevice verifies that per-device
// storage SAT step results are mapped onto the matching snapshot disks by
// linux_device base name: the passing nvme0n1 stays OK while the failing
// sda becomes Critical.
func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
	baseDir := t.TempDir()
	runDir := filepath.Join(baseDir, "storage-20260325-161151")
	if err := os.MkdirAll(runDir, 0755); err != nil {
		t.Fatal(err)
	}
	// One OK step for nvme0n1 and one failed step for sda; overall FAILED.
	raw := "run_at_utc=2026-03-25T16:11:51Z\nnvme0n1_nvme_smart_log_status=OK\nsda_smartctl_health_status=FAILED\noverall_status=FAILED\n"
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
		t.Fatal(err)
	}

	nvme := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/nvme0n1"}}
	usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
	snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}

	applyLatestSATStatuses(&snap, baseDir)

	if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
		t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
	}
	if snap.Storage[1].Status == nil || *snap.Storage[1].Status != "Critical" {
		t.Fatalf("sda status=%v want Critical", snap.Storage[1].Status)
	}
}
|
||||||
|
|
||||||
|
// TestApplyLatestSATStatusesMarksAMDGPUs verifies that a failed AMD GPU SAT
// run marks an AMD display-class PCIe device Critical.
func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
	baseDir := t.TempDir()
	runDir := filepath.Join(baseDir, "gpu-amd-20260325-161436")
	if err := os.MkdirAll(runDir, 0755); err != nil {
		t.Fatal(err)
	}
	raw := "run_at_utc=2026-03-25T16:14:36Z\noverall_status=FAILED\n"
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
		t.Fatal(err)
	}

	// Class and manufacturer chosen to satisfy matchesGPUVendor for "amd".
	class := "DisplayController"
	manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
	snap := schema.HardwareSnapshot{
		PCIeDevices: []schema.HardwarePCIeDevice{{
			DeviceClass: &class,
			Manufacturer: &manufacturer,
		}},
	}

	applyLatestSATStatuses(&snap, baseDir)

	if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
	}
}
|
||||||
364
audit/internal/app/support_bundle.go
Normal file
364
audit/internal/app/support_bundle.go
Normal file
@@ -0,0 +1,364 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"compress/gzip"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// supportBundleServices lists the systemd units whose status and journal
// output are captured into every support bundle.
var supportBundleServices = []string{
	"bee-audit.service",
	"bee-web.service",
	"bee-network.service",
	"bee-nvidia.service",
	"bee-preflight.service",
	"bee-sshsetup.service",
}
|
||||||
|
|
||||||
|
// supportBundleCommands maps bundle-relative output file names to the
// diagnostic commands whose combined output is captured there.
var supportBundleCommands = []struct {
	name string // path of the output file inside the staged bundle
	cmd []string // argv of the command to run
}{
	{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
	{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
	{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
	{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
	{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
	{name: "system/mount.txt", cmd: []string{"mount"}},
	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
	{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
}
|
||||||
|
|
||||||
|
// BuildSupportBundle assembles a diagnostic tar.gz under the OS temp dir and
// returns its path. It stages a copy of the export directory (minus nested
// archives), service status/journal captures, generic system command output,
// and a manifest, then archives the staging tree. Older bundles in the temp
// dir are pruned first. The staging directory is always removed; the
// returned archive is left for the caller to move or serve.
func BuildSupportBundle(exportDir string) (string, error) {
	exportDir = strings.TrimSpace(exportDir)
	if exportDir == "" {
		exportDir = DefaultExportDir
	}
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		return "", err
	}
	// Prune stale/excess bundles so the temp dir does not grow unbounded.
	if err := cleanupOldSupportBundles(os.TempDir()); err != nil {
		return "", err
	}

	host := sanitizeFilename(hostnameOr("unknown"))
	ts := time.Now().UTC().Format("20060102-150405")
	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
	if err := os.MkdirAll(stageRoot, 0755); err != nil {
		return "", err
	}
	defer os.RemoveAll(stageRoot)

	if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
		return "", err
	}
	if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
		return "", err
	}
	// Per-service status and journal captures.
	for _, svc := range supportBundleServices {
		if err := writeCommandOutput(filepath.Join(stageRoot, "systemd", svc+".status.txt"), []string{"systemctl", "status", svc, "--no-pager"}); err != nil {
			return "", err
		}
		if err := writeCommandOutput(filepath.Join(stageRoot, "systemd", svc+".journal.log"), []string{"journalctl", "--no-pager", "-u", svc}); err != nil {
			return "", err
		}
	}
	// Generic system snapshots (uname, lspci, routes, …).
	for _, item := range supportBundleCommands {
		if err := writeCommandOutput(filepath.Join(stageRoot, item.name), item.cmd); err != nil {
			return "", err
		}
	}
	// Manifest last, so it can list everything staged above.
	if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
		return "", err
	}

	archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
	if err := createSupportTarGz(archivePath, stageRoot); err != nil {
		return "", err
	}
	return archivePath, nil
}
|
||||||
|
|
||||||
|
func cleanupOldSupportBundles(dir string) error {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
type entry struct {
|
||||||
|
path string
|
||||||
|
mod time.Time
|
||||||
|
}
|
||||||
|
list := make([]entry, 0, len(matches))
|
||||||
|
for _, match := range matches {
|
||||||
|
info, err := os.Stat(match)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if time.Since(info.ModTime()) > 24*time.Hour {
|
||||||
|
_ = os.Remove(match)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
list = append(list, entry{path: match, mod: info.ModTime()})
|
||||||
|
}
|
||||||
|
sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
|
||||||
|
if len(list) > 3 {
|
||||||
|
for _, old := range list[3:] {
|
||||||
|
_ = os.Remove(old.path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeJournalDump writes one combined journalctl dump covering every bundle
// service to dst. Command failure is not fatal: when journalctl produced no
// output, the error text (or a placeholder) is written instead so the bundle
// always contains the file.
func writeJournalDump(dst string) error {
	args := []string{"--no-pager"}
	for _, svc := range supportBundleServices {
		args = append(args, "-u", svc)
	}
	raw, err := exec.Command("journalctl", args...).CombinedOutput()
	// Keep partial output when the command failed but still printed something.
	if len(raw) == 0 && err != nil {
		raw = []byte(err.Error() + "\n")
	}
	if len(raw) == 0 {
		raw = []byte("no journal output\n")
	}
	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
		return err
	}
	return os.WriteFile(dst, raw, 0644)
}
|
||||||
|
|
||||||
|
func writeCommandOutput(dst string, cmd []string) error {
|
||||||
|
if len(cmd) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
raw, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
||||||
|
if len(raw) == 0 {
|
||||||
|
if err != nil {
|
||||||
|
raw = []byte(err.Error() + "\n")
|
||||||
|
} else {
|
||||||
|
raw = []byte("no output\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(dst, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeManifest writes a manifest to dst describing the bundle: version,
// host, timestamp, source export dir, then a sorted "<relpath>\t<size>" line
// for every file already staged under stageRoot (the manifest itself is
// excluded from the listing).
func writeManifest(dst, exportDir, stageRoot string) error {
	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
		return err
	}
	var body strings.Builder
	fmt.Fprintf(&body, "bee_version=%s\n", buildVersion())
	fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
	fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
	fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
	fmt.Fprintf(&body, "\nfiles:\n")

	var files []string
	if err := filepath.Walk(stageRoot, func(path string, info os.FileInfo, err error) error {
		if err != nil || info.IsDir() {
			return err
		}
		// Skip the manifest file itself — it is still being written.
		if filepath.Clean(path) == filepath.Clean(dst) {
			return nil
		}
		rel, err := filepath.Rel(stageRoot, path)
		if err != nil {
			return err
		}
		files = append(files, fmt.Sprintf("%s\t%d", rel, info.Size()))
		return nil
	}); err != nil {
		return err
	}
	// Sorted for a stable, diff-friendly manifest.
	sort.Strings(files)
	for _, line := range files {
		body.WriteString(line)
		body.WriteByte('\n')
	}
	return os.WriteFile(dst, []byte(body.String()), 0644)
}
|
||||||
|
|
||||||
|
// buildVersion returns the trimmed output of "bee version", or "unknown"
// when the binary is missing or the command fails.
func buildVersion() string {
	raw, err := exec.Command("bee", "version").CombinedOutput()
	if err != nil {
		return "unknown"
	}
	return strings.TrimSpace(string(raw))
}
|
||||||
|
|
||||||
|
func copyDirContents(srcDir, dstDir string) error {
|
||||||
|
entries, err := os.ReadDir(srcDir)
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, entry := range entries {
|
||||||
|
src := filepath.Join(srcDir, entry.Name())
|
||||||
|
dst := filepath.Join(dstDir, entry.Name())
|
||||||
|
if err := copyPath(src, dst); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// copyExportDirForSupportBundle copies the export directory into the staging
// tree, skipping archives that would bloat (or recursively include) the
// bundle: SAT tarballs under bee-sat/ and previously generated
// bee-support-*.tar.gz files.
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
	return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
		// Normalize to slash-separated form for the prefix checks below.
		cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
		if cleanRel == "" {
			return true
		}
		if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
			return false
		}
		if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
			return false
		}
		return true
	})
}
|
||||||
|
|
||||||
|
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||||
|
entries, err := os.ReadDir(srcDir)
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, entry := range entries {
|
||||||
|
src := filepath.Join(srcDir, entry.Name())
|
||||||
|
dst := filepath.Join(dstDir, entry.Name())
|
||||||
|
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// copyPath recursively copies src to dst, preserving permission bits.
// Directories are recreated and their entries copied; regular files are
// streamed with io.Copy. NOTE(review): os.Stat follows symlinks, so a link
// is copied as its target (and a broken link fails the copy) — confirm this
// is acceptable for staging support bundles.
func copyPath(src, dst string) error {
	info, err := os.Stat(src)
	if err != nil {
		return err
	}
	if info.IsDir() {
		if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
			return err
		}
		entries, err := os.ReadDir(src)
		if err != nil {
			return err
		}
		for _, entry := range entries {
			if err := copyPath(filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name())); err != nil {
				return err
			}
		}
		return nil
	}

	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
		return err
	}
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	// O_TRUNC: an existing destination file is overwritten.
	out, err := os.OpenFile(dst, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, info.Mode().Perm())
	if err != nil {
		return err
	}
	defer out.Close()

	_, err = io.Copy(out, in)
	return err
}
|
||||||
|
|
||||||
|
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
|
||||||
|
info, err := os.Stat(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(rootSrc, src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if keep != nil && !keep(rel, info) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
entries, err := os.ReadDir(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, entry := range entries {
|
||||||
|
if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return copyPath(src, dst)
|
||||||
|
}
|
||||||
|
|
||||||
|
func createSupportTarGz(dst, srcDir string) error {
|
||||||
|
file, err := os.Create(dst)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
gz := gzip.NewWriter(file)
|
||||||
|
defer gz.Close()
|
||||||
|
|
||||||
|
tw := tar.NewWriter(gz)
|
||||||
|
defer tw.Close()
|
||||||
|
|
||||||
|
base := filepath.Dir(srcDir)
|
||||||
|
return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
header, err := tar.FileInfoHeader(info, "")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
header.Name, err = filepath.Rel(base, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := tw.WriteHeader(header); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
_, err = io.Copy(tw, f)
|
||||||
|
return err
|
||||||
|
})
|
||||||
|
}
|
||||||
252
audit/internal/collector/amdgpu.go
Normal file
252
audit/internal/collector/amdgpu.go
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/csv"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Indirection points for process execution and glob lookup so tests can
// stub out the rocm-smi binary (the test suite replaces these hooks).
var (
	amdSMIExecCommand = exec.Command
	amdSMILookPath    = exec.LookPath
	amdSMIGlob        = filepath.Glob
)

// amdSMIExecutableGlobs lists fallback install locations probed when
// rocm-smi is not found on PATH.
var amdSMIExecutableGlobs = []string{
	"/opt/rocm/bin/rocm-smi",
	"/opt/rocm-*/bin/rocm-smi",
	"/usr/local/bin/rocm-smi",
}

// amdGPUInfo holds the per-GPU fields harvested from rocm-smi queries.
type amdGPUInfo struct {
	BDF      string   // normalized PCI bus/device/function address
	Serial   string   // from --showserial
	Product  string   // from --showproductname
	Firmware string   // VBIOS version, from --showvbios
	PowerW   *float64 // from --showpower; nil when unavailable
	TempC    *float64 // from --showtemp; nil when unavailable
}
|
||||||
|
|
||||||
|
func enrichPCIeWithAMD(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
|
if !hasAMDGPUDevices(devs) {
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
infoByBDF, err := queryAMDGPUs()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("amdgpu: enrichment skipped", "err", err)
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
enriched := 0
|
||||||
|
for i := range devs {
|
||||||
|
if !isAMDGPUDevice(devs[i]) || devs[i].BDF == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
info, ok := infoByBDF[normalizePCIeBDF(*devs[i].BDF)]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(info.Serial) != "" {
|
||||||
|
devs[i].SerialNumber = &info.Serial
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(info.Firmware) != "" {
|
||||||
|
devs[i].Firmware = &info.Firmware
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(info.Product) != "" && devs[i].Model == nil {
|
||||||
|
devs[i].Model = &info.Product
|
||||||
|
}
|
||||||
|
if info.PowerW != nil {
|
||||||
|
devs[i].PowerW = info.PowerW
|
||||||
|
}
|
||||||
|
if info.TempC != nil {
|
||||||
|
devs[i].TemperatureC = info.TempC
|
||||||
|
}
|
||||||
|
enriched++
|
||||||
|
}
|
||||||
|
if enriched > 0 {
|
||||||
|
slog.Info("amdgpu: enriched", "count", enriched)
|
||||||
|
}
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||||
|
for _, dev := range devs {
|
||||||
|
if isAMDGPUDevice(dev) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
|
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||||
|
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||||
|
busByCard, err := queryAMDField("--showbus")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
infoByCard := map[string]amdGPUInfo{}
|
||||||
|
for card, bus := range busByCard {
|
||||||
|
bdf := normalizePCIeBDF(bus)
|
||||||
|
if bdf == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
infoByCard[card] = amdGPUInfo{BDF: bdf}
|
||||||
|
}
|
||||||
|
if len(infoByCard) == 0 {
|
||||||
|
return map[string]amdGPUInfo{}, nil
|
||||||
|
}
|
||||||
|
mergeAMDField(infoByCard, "--showserial", func(info *amdGPUInfo, value string) { info.Serial = value })
|
||||||
|
mergeAMDField(infoByCard, "--showproductname", func(info *amdGPUInfo, value string) { info.Product = value })
|
||||||
|
mergeAMDField(infoByCard, "--showvbios", func(info *amdGPUInfo, value string) { info.Firmware = value })
|
||||||
|
mergeAMDNumericField(infoByCard, "--showpower", func(info *amdGPUInfo, value float64) { info.PowerW = &value })
|
||||||
|
mergeAMDNumericField(infoByCard, "--showtemp", func(info *amdGPUInfo, value float64) { info.TempC = &value })
|
||||||
|
|
||||||
|
result := make(map[string]amdGPUInfo, len(infoByCard))
|
||||||
|
for _, info := range infoByCard {
|
||||||
|
if info.BDF == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result[info.BDF] = info
|
||||||
|
}
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergeAMDField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, string)) {
|
||||||
|
values, err := queryAMDField(flag)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for card, value := range values {
|
||||||
|
info, ok := infoByCard[card]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value = strings.TrimSpace(value)
|
||||||
|
if value == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
apply(&info, value)
|
||||||
|
infoByCard[card] = info
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergeAMDNumericField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, float64)) {
|
||||||
|
values, err := queryAMDNumericField(flag)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for card, value := range values {
|
||||||
|
info, ok := infoByCard[card]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
apply(&info, value)
|
||||||
|
infoByCard[card] = info
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryAMDField(flag string) (map[string]string, error) {
|
||||||
|
cmd, err := resolveAMDSMICmd(flag, "--csv")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
out, err := amdSMIExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return parseROCmSingleValueCSV(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryAMDNumericField(flag string) (map[string]float64, error) {
|
||||||
|
values, err := queryAMDField(flag)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
out := map[string]float64{}
|
||||||
|
for card, raw := range values {
|
||||||
|
if value, ok := firstFloat(raw); ok {
|
||||||
|
out[card] = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveAMDSMICmd(args ...string) ([]string, error) {
|
||||||
|
if path, err := amdSMILookPath("rocm-smi"); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
for _, pattern := range amdSMIExecutableGlobs {
|
||||||
|
matches, err := amdSMIGlob(pattern)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sort.Strings(matches)
|
||||||
|
for _, match := range matches {
|
||||||
|
return append([]string{match}, args...), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil, exec.ErrNotFound
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseROCmSingleValueCSV(raw string) map[string]string {
|
||||||
|
rows := map[string]string{}
|
||||||
|
reader := csv.NewReader(strings.NewReader(raw))
|
||||||
|
reader.FieldsPerRecord = -1
|
||||||
|
records, err := reader.ReadAll()
|
||||||
|
if err != nil {
|
||||||
|
return rows
|
||||||
|
}
|
||||||
|
for _, rec := range records {
|
||||||
|
if len(rec) < 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
card := normalizeROCmCardKey(rec[0])
|
||||||
|
if card == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value := strings.TrimSpace(strings.Join(rec[1:], ","))
|
||||||
|
if value == "" || looksLikeCSVHeaderValue(value) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rows[card] = value
|
||||||
|
}
|
||||||
|
return rows
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeROCmCardKey canonicalizes a rocm-smi row key to the "cardN"
// form. Bare indices gain the "card" prefix; existing "card…" keys pass
// through; header words ("device", "gpu", "card") and anything else
// unrecognized map to the empty string.
func normalizeROCmCardKey(raw string) string {
	key := strings.Trim(strings.ToLower(strings.TrimSpace(raw)), "\"")
	switch key {
	case "", "device", "gpu", "card":
		return ""
	}
	if strings.HasPrefix(key, "card") {
		return key
	}
	if _, err := strconv.Atoi(key); err == nil {
		return "card" + key
	}
	return ""
}
|
||||||
|
|
||||||
|
// looksLikeCSVHeaderValue reports whether a CSV cell looks like a column
// header (mentions product/serial/vbios/bus) rather than a data value.
func looksLikeCSVHeaderValue(value string) bool {
	lowered := strings.ToLower(strings.TrimSpace(value))
	for _, marker := range []string{"product", "serial", "vbios", "bus"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
|
||||||
56
audit/internal/collector/amdgpu_test.go
Normal file
56
audit/internal/collector/amdgpu_test.go
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os/exec"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestParseROCmSingleValueCSV checks that well-formed rocm-smi CSV parses
// into per-card values and that the header row is dropped (because
// "device" is not a valid card key).
func TestParseROCmSingleValueCSV(t *testing.T) {
	raw := "device,Serial Number\ncard0,ABC123\ncard1,XYZ789\n"
	got := parseROCmSingleValueCSV(raw)
	if got["card0"] != "ABC123" {
		t.Fatalf("card0=%q want ABC123", got["card0"])
	}
	if got["card1"] != "XYZ789" {
		t.Fatalf("card1=%q want XYZ789", got["card1"])
	}
}
|
||||||
|
|
||||||
|
// TestQueryAMDNumericFieldParsesUnits stubs the package-level exec hooks
// so no real rocm-smi is needed, and checks that values carrying a unit
// suffix ("45.5c") are parsed into plain floats keyed by card.
func TestQueryAMDNumericFieldParsesUnits(t *testing.T) {
	// Save and restore the hooks so other tests see the real exec.
	origExec := amdSMIExecCommand
	origLookPath := amdSMILookPath
	t.Cleanup(func() {
		amdSMIExecCommand = origExec
		amdSMILookPath = origLookPath
	})

	amdSMILookPath = func(string) (string, error) { return "/usr/bin/rocm-smi", nil }
	// Replace the binary with a shell stub emitting canned CSV output.
	amdSMIExecCommand = func(name string, args ...string) *exec.Cmd {
		return exec.Command("sh", "-c", "printf 'device,Temperature\\ncard0,45.5c\\ncard1,67.0c\\n'")
	}

	got, err := queryAMDNumericField("--showtemp")
	if err != nil {
		t.Fatalf("queryAMDNumericField: %v", err)
	}
	if got["card0"] != 45.5 {
		t.Fatalf("card0=%v want 45.5", got["card0"])
	}
	if got["card1"] != 67.0 {
		t.Fatalf("card1=%v want 67.0", got["card1"])
	}
}
|
||||||
|
|
||||||
|
// TestNormalizeROCmCardKey covers the canonical forms: bare indices gain a
// "card" prefix, existing "cardN" keys pass through, and header words or
// empty input map to "".
func TestNormalizeROCmCardKey(t *testing.T) {
	tests := map[string]string{
		"0":      "card0",
		"card1":  "card1",
		"Device": "",
		"":       "",
	}
	for input, want := range tests {
		if got := normalizeROCmCardKey(input); got != want {
			t.Fatalf("normalizeROCmCardKey(%q)=%q want %q", input, got, want)
		}
	}
}
|
||||||
@@ -4,10 +4,27 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
"bufio"
|
"bufio"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// execDmidecode runs `dmidecode -t <typeNum>` and returns its stdout as a
// string. Declared as a package variable rather than a func — presumably
// so tests can stub it (runDmidecode delegates here); confirm against the
// test suite.
var execDmidecode = func(typeNum string) (string, error) {
	out, err := exec.Command("dmidecode", "-t", typeNum).Output()
	if err != nil {
		return "", err
	}
	return string(out), nil
}
|
||||||
|
|
||||||
|
// execIpmitool runs `ipmitool <args...>` and returns its stdout as a
// string. Declared as a package variable rather than a func — presumably
// so tests can stub it, mirroring execDmidecode; confirm against the
// test suite.
var execIpmitool = func(args ...string) (string, error) {
	out, err := exec.Command("ipmitool", args...).Output()
	if err != nil {
		return "", err
	}
	return string(out), nil
}
|
||||||
|
|
||||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||||
@@ -61,6 +78,45 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
|||||||
return board
|
return board
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
	// Both the tool and the kernel IPMI device node must exist before probing.
	if _, err := exec.LookPath("ipmitool"); err != nil {
		return nil
	}
	if _, err := os.Stat("/dev/ipmi0"); err != nil {
		return nil
	}
	out, err := execIpmitool("mc", "info")
	if err != nil {
		// Best-effort collector: log and report nothing rather than fail.
		slog.Info("bmc: ipmitool mc info unavailable", "err", err)
		return nil
	}
	version := parseBMCFirmwareRevision(out)
	if version == "" {
		return nil
	}
	slog.Info("bmc: collected", "version", version)
	return []schema.HardwareFirmwareRecord{
		{DeviceName: "BMC", Version: version},
	}
}
||||||
|
|
||||||
|
// parseBMCFirmwareRevision extracts the "Firmware Revision" field from
// `ipmitool mc info` output, or "" when the field is absent.
func parseBMCFirmwareRevision(out string) string {
	for _, raw := range strings.Split(out, "\n") {
		name, value, found := strings.Cut(strings.TrimSpace(raw), ":")
		if !found {
			continue
		}
		if strings.TrimSpace(name) == "Firmware Revision" {
			return strings.TrimSpace(value)
		}
	}
	return ""
}
|
||||||
|
|
||||||
// parseBIOSFirmware extracts BIOS version from dmidecode type 0 output.
|
// parseBIOSFirmware extracts BIOS version from dmidecode type 0 output.
|
||||||
func parseBIOSFirmware(type0 string) []schema.HardwareFirmwareRecord {
|
func parseBIOSFirmware(type0 string) []schema.HardwareFirmwareRecord {
|
||||||
fields := parseDMIFields(type0, "BIOS Information")
|
fields := parseDMIFields(type0, "BIOS Information")
|
||||||
@@ -141,9 +197,5 @@ func cleanDMIValue(v string) string {
|
|||||||
|
|
||||||
// runDmidecode executes dmidecode -t <typeNum> and returns its stdout.
|
// runDmidecode executes dmidecode -t <typeNum> and returns its stdout.
|
||||||
func runDmidecode(typeNum string) (string, error) {
|
func runDmidecode(typeNum string) (string, error) {
|
||||||
out, err := exec.Command("dmidecode", "-t", typeNum).Output()
|
return execDmidecode(typeNum)
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return string(out), nil
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,15 +4,18 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bee/audit/internal/runtimeenv"
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Run executes all collectors and returns the combined snapshot.
|
// Run executes all collectors and returns the combined snapshot.
|
||||||
// Partial failures are logged as warnings; collection always completes.
|
// Partial failures are logged as warnings; collection always completes.
|
||||||
func Run() schema.HardwareIngestRequest {
|
func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
|
collectedAt := time.Now().UTC().Format(time.RFC3339)
|
||||||
slog.Info("audit started")
|
slog.Info("audit started")
|
||||||
|
|
||||||
snap := schema.HardwareSnapshot{}
|
snap := schema.HardwareSnapshot{}
|
||||||
@@ -20,32 +23,45 @@ func Run() schema.HardwareIngestRequest {
|
|||||||
board, biosFW := collectBoard()
|
board, biosFW := collectBoard()
|
||||||
snap.Board = board
|
snap.Board = board
|
||||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||||
|
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
||||||
|
|
||||||
cpus, cpuFW := collectCPUs(snap.Board.SerialNumber)
|
snap.CPUs = collectCPUs()
|
||||||
snap.CPUs = cpus
|
|
||||||
snap.Firmware = append(snap.Firmware, cpuFW...)
|
|
||||||
|
|
||||||
snap.Memory = collectMemory()
|
snap.Memory = collectMemory()
|
||||||
|
sensorDoc, err := readSensorsJSONDoc()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("sensors: unavailable for enrichment", "err", err)
|
||||||
|
}
|
||||||
|
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
||||||
|
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||||
snap.Storage = collectStorage()
|
snap.Storage = collectStorage()
|
||||||
snap.PCIeDevices = collectPCIe()
|
snap.PCIeDevices = collectPCIe()
|
||||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices, snap.Board.SerialNumber)
|
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||||
|
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||||
|
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||||
|
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||||
snap.PowerSupplies = collectPSUs()
|
snap.PowerSupplies = collectPSUs()
|
||||||
|
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||||
|
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||||
|
finalizeSnapshot(&snap, collectedAt)
|
||||||
|
|
||||||
// remaining collectors added in steps 1.8 – 1.10
|
// remaining collectors added in steps 1.8 – 1.10
|
||||||
|
|
||||||
slog.Info("audit completed", "duration", time.Since(start).Round(time.Millisecond))
|
slog.Info("audit completed", "duration", time.Since(start).Round(time.Millisecond))
|
||||||
|
|
||||||
sourceType := "livcd"
|
sourceType := "manual"
|
||||||
protocol := "os-direct"
|
var targetHost *string
|
||||||
|
if hostname, err := os.Hostname(); err == nil && hostname != "" {
|
||||||
|
targetHost = &hostname
|
||||||
|
}
|
||||||
return schema.HardwareIngestRequest{
|
return schema.HardwareIngestRequest{
|
||||||
SourceType: &sourceType,
|
SourceType: &sourceType,
|
||||||
Protocol: &protocol,
|
TargetHost: targetHost,
|
||||||
CollectedAt: time.Now().UTC().Format(time.RFC3339),
|
CollectedAt: collectedAt,
|
||||||
Hardware: snap,
|
Hardware: snap,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
64
audit/internal/collector/contract.go
Normal file
64
audit/internal/collector/contract.go
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
const (
|
||||||
|
statusOK = "OK"
|
||||||
|
statusWarning = "Warning"
|
||||||
|
statusCritical = "Critical"
|
||||||
|
statusUnknown = "Unknown"
|
||||||
|
statusEmpty = "Empty"
|
||||||
|
)
|
||||||
|
|
||||||
|
// mapPCIeDeviceClass converts a raw lspci-style class description into the
// canonical device-class identifier used by the schema. Unrecognized
// classes pass through unchanged; empty input maps to "".
func mapPCIeDeviceClass(raw string) string {
	lowered := strings.ToLower(strings.TrimSpace(raw))
	if lowered == "" {
		return ""
	}
	// Order matters: more specific substrings are checked first (e.g.
	// "storage controller" before the broader "raid"/"mass storage").
	rules := []struct {
		class    string
		keywords []string
	}{
		{"EthernetController", []string{"ethernet controller"}},
		{"FibreChannelController", []string{"fibre channel"}},
		{"NetworkController", []string{"network controller", "infiniband controller"}},
		{"StorageController", []string{"serial attached scsi", "storage controller"}},
		{"MassStorageController", []string{"raid", "mass storage"}},
		{"DisplayController", []string{"display controller"}},
		{"VideoController", []string{"vga", "3d controller", "video controller"}},
		{"ProcessingAccelerator", []string{"processing accelerators", "processing accelerator"}},
	}
	for _, rule := range rules {
		for _, kw := range rule.keywords {
			if strings.Contains(lowered, kw) {
				return rule.class
			}
		}
	}
	return raw
}
||||||
|
|
||||||
|
// isNICClass reports whether the canonical class names a network interface
// controller.
func isNICClass(class string) bool {
	trimmed := strings.TrimSpace(class)
	return trimmed == "EthernetController" || trimmed == "NetworkController"
}
||||||
|
|
||||||
|
// isGPUClass reports whether the canonical class names a GPU-like device
// (video, display, or processing accelerator).
func isGPUClass(class string) bool {
	trimmed := strings.TrimSpace(class)
	return trimmed == "VideoController" ||
		trimmed == "DisplayController" ||
		trimmed == "ProcessingAccelerator"
}
||||||
|
|
||||||
|
// isRAIDClass reports whether the canonical class names a RAID / storage
// controller.
func isRAIDClass(class string) bool {
	trimmed := strings.TrimSpace(class)
	return trimmed == "MassStorageController" || trimmed == "StorageController"
}
|
||||||
@@ -3,42 +3,39 @@ package collector
|
|||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
"bufio"
|
"bufio"
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
// collectCPUs runs dmidecode -t 4 and reads microcode version from sysfs.
|
// collectCPUs runs dmidecode -t 4 and enriches CPUs with microcode from sysfs.
|
||||||
func collectCPUs(boardSerial string) ([]schema.HardwareCPU, []schema.HardwareFirmwareRecord) {
|
func collectCPUs() []schema.HardwareCPU {
|
||||||
out, err := runDmidecode("4")
|
out, err := runDmidecode("4")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("cpu: dmidecode type 4 failed", "err", err)
|
slog.Warn("cpu: dmidecode type 4 failed", "err", err)
|
||||||
return nil, nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
cpus := parseCPUs(out, boardSerial)
|
cpus := parseCPUs(out)
|
||||||
|
|
||||||
var firmware []schema.HardwareFirmwareRecord
|
|
||||||
if mc := readMicrocode(); mc != "" {
|
if mc := readMicrocode(); mc != "" {
|
||||||
firmware = append(firmware, schema.HardwareFirmwareRecord{
|
for i := range cpus {
|
||||||
DeviceName: "CPU Microcode",
|
cpus[i].Firmware = &mc
|
||||||
Version: mc,
|
}
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Info("cpu: collected", "count", len(cpus))
|
slog.Info("cpu: collected", "count", len(cpus))
|
||||||
return cpus, firmware
|
return cpus
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseCPUs splits dmidecode output into per-processor sections and parses each.
|
// parseCPUs splits dmidecode output into per-processor sections and parses each.
|
||||||
func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
|
func parseCPUs(output string) []schema.HardwareCPU {
|
||||||
sections := splitDMISections(output, "Processor Information")
|
sections := splitDMISections(output, "Processor Information")
|
||||||
cpus := make([]schema.HardwareCPU, 0, len(sections))
|
cpus := make([]schema.HardwareCPU, 0, len(sections))
|
||||||
|
|
||||||
for _, section := range sections {
|
for _, section := range sections {
|
||||||
cpu, ok := parseCPUSection(section, boardSerial)
|
cpu, ok := parseCPUSection(section)
|
||||||
if !ok {
|
if !ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -49,14 +46,16 @@ func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
|
|||||||
|
|
||||||
// parseCPUSection parses one "Processor Information" block into a HardwareCPU.
|
// parseCPUSection parses one "Processor Information" block into a HardwareCPU.
|
||||||
// Returns false if the socket is unpopulated.
|
// Returns false if the socket is unpopulated.
|
||||||
func parseCPUSection(fields map[string]string, boardSerial string) (schema.HardwareCPU, bool) {
|
func parseCPUSection(fields map[string]string) (schema.HardwareCPU, bool) {
|
||||||
status := parseCPUStatus(fields["Status"])
|
status := parseCPUStatus(fields["Status"])
|
||||||
if status == "EMPTY" {
|
if status == statusEmpty {
|
||||||
return schema.HardwareCPU{}, false
|
return schema.HardwareCPU{}, false
|
||||||
}
|
}
|
||||||
|
|
||||||
cpu := schema.HardwareCPU{}
|
cpu := schema.HardwareCPU{}
|
||||||
cpu.Status = &status
|
cpu.Status = &status
|
||||||
|
present := true
|
||||||
|
cpu.Present = &present
|
||||||
|
|
||||||
if socket, ok := parseSocketIndex(fields["Socket Designation"]); ok {
|
if socket, ok := parseSocketIndex(fields["Socket Designation"]); ok {
|
||||||
cpu.Socket = &socket
|
cpu.Socket = &socket
|
||||||
@@ -70,11 +69,6 @@ func parseCPUSection(fields map[string]string, boardSerial string) (schema.Hardw
|
|||||||
}
|
}
|
||||||
if v := cleanDMIValue(fields["Serial Number"]); v != "" {
|
if v := cleanDMIValue(fields["Serial Number"]); v != "" {
|
||||||
cpu.SerialNumber = &v
|
cpu.SerialNumber = &v
|
||||||
} else if boardSerial != "" && cpu.Socket != nil {
|
|
||||||
// Intel Xeon never exposes serial via DMI — generate stable fallback
|
|
||||||
// matching core's generateCPUVendorSerial() logic
|
|
||||||
fb := fmt.Sprintf("%s-CPU-%d", boardSerial, *cpu.Socket)
|
|
||||||
cpu.SerialNumber = &fb
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if v := parseMHz(fields["Max Speed"]); v > 0 {
|
if v := parseMHz(fields["Max Speed"]); v > 0 {
|
||||||
@@ -99,15 +93,15 @@ func parseCPUStatus(raw string) string {
|
|||||||
upper := strings.ToUpper(raw)
|
upper := strings.ToUpper(raw)
|
||||||
switch {
|
switch {
|
||||||
case upper == "" || upper == "UNKNOWN":
|
case upper == "" || upper == "UNKNOWN":
|
||||||
return "UNKNOWN"
|
return statusUnknown
|
||||||
case strings.Contains(upper, "UNPOPULATED") || strings.Contains(upper, "NOT POPULATED"):
|
case strings.Contains(upper, "UNPOPULATED") || strings.Contains(upper, "NOT POPULATED"):
|
||||||
return "EMPTY"
|
return statusEmpty
|
||||||
case strings.Contains(upper, "ENABLED"):
|
case strings.Contains(upper, "ENABLED"):
|
||||||
return "OK"
|
return statusOK
|
||||||
case strings.Contains(upper, "DISABLED"):
|
case strings.Contains(upper, "DISABLED"):
|
||||||
return "WARNING"
|
return statusWarning
|
||||||
default:
|
default:
|
||||||
return "UNKNOWN"
|
return statusUnknown
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -178,7 +172,7 @@ func parseInt(v string) int {
|
|||||||
// readMicrocode reads the CPU microcode revision from sysfs.
|
// readMicrocode reads the CPU microcode revision from sysfs.
|
||||||
// Returns empty string if unavailable.
|
// Returns empty string if unavailable.
|
||||||
func readMicrocode() string {
|
func readMicrocode() string {
|
||||||
data, err := os.ReadFile("/sys/devices/system/cpu/cpu0/microcode/version")
|
data, err := os.ReadFile(filepath.Join(cpuSysBaseDir, "cpu0", "microcode", "version"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|||||||
196
audit/internal/collector/cpu_telemetry.go
Normal file
196
audit/internal/collector/cpu_telemetry.go
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
	// cpuSysBaseDir is the sysfs CPU root (readMicrocode joins paths under
	// it); a variable — presumably so tests can point it at a fixture
	// directory, confirm against the test suite.
	cpuSysBaseDir = "/sys/devices/system/cpu"
	// socketIndexRe extracts a socket number from sensor chip/feature
	// names such as "Package id 0", "socket 1", or "cpu 0" (case-insensitive).
	socketIndexRe = regexp.MustCompile(`(?i)(?:package id|socket|cpu)\s*([0-9]+)`)
)
|
||||||
|
|
||||||
|
func enrichCPUsWithTelemetry(cpus []schema.HardwareCPU, doc sensorsDoc) []schema.HardwareCPU {
|
||||||
|
if len(cpus) == 0 {
|
||||||
|
return cpus
|
||||||
|
}
|
||||||
|
|
||||||
|
tempBySocket := cpuTempsFromSensors(doc, len(cpus))
|
||||||
|
powerBySocket := cpuPowerFromSensors(doc, len(cpus))
|
||||||
|
throttleBySocket := cpuThrottleBySocket()
|
||||||
|
|
||||||
|
for i := range cpus {
|
||||||
|
socket := 0
|
||||||
|
if cpus[i].Socket != nil {
|
||||||
|
socket = *cpus[i].Socket
|
||||||
|
}
|
||||||
|
if value, ok := tempBySocket[socket]; ok {
|
||||||
|
cpus[i].TemperatureC = &value
|
||||||
|
}
|
||||||
|
if value, ok := powerBySocket[socket]; ok {
|
||||||
|
cpus[i].PowerW = &value
|
||||||
|
}
|
||||||
|
if value, ok := throttleBySocket[socket]; ok {
|
||||||
|
cpus[i].Throttled = &value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return cpus
|
||||||
|
}
|
||||||
|
|
||||||
|
// cpuTempsFromSensors extracts per-socket CPU temperatures from the
// sensors JSON document.
//
// A feature is used when classifySensorFeature labels it "temp" and a
// numeric "_input" value is present (firstFeatureFloat). The reading is
// attributed to a socket when the chip or feature name encodes one
// (detectCPUSocket); the first reading seen per socket wins. Readings
// that merely look CPU-related (isLikelyCPUTemp) are collected as a
// fallback and applied to socket 0 only on single-CPU systems with no
// attributed readings.
//
// NOTE(review): map iteration order is random in Go, so when several
// features match the same socket, which one "wins" is nondeterministic.
func cpuTempsFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
	out := map[int]float64{}
	if len(doc) == 0 {
		return out
	}
	var fallback []float64
	for chip, features := range doc {
		for featureName, raw := range features {
			feature, ok := raw.(map[string]any)
			if !ok {
				continue
			}
			if classifySensorFeature(feature) != "temp" {
				continue
			}
			temp, ok := firstFeatureFloat(feature, "_input")
			if !ok {
				continue
			}
			if socket, ok := detectCPUSocket(chip, featureName); ok {
				// First attributed reading per socket wins.
				if _, exists := out[socket]; !exists {
					out[socket] = temp
				}
				continue
			}
			if isLikelyCPUTemp(chip, featureName) {
				fallback = append(fallback, temp)
			}
		}
	}
	// Single-socket fallback: use an unattributed CPU-looking reading.
	if len(out) == 0 && cpuCount == 1 && len(fallback) > 0 {
		out[0] = fallback[0]
	}
	return out
}
|
||||||
|
|
||||||
|
func cpuPowerFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
|
||||||
|
out := map[int]float64{}
|
||||||
|
if len(doc) == 0 {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
var fallback []float64
|
||||||
|
for chip, features := range doc {
|
||||||
|
for featureName, raw := range features {
|
||||||
|
feature, ok := raw.(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if classifySensorFeature(feature) != "power" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
power, ok := firstFeatureFloatWithContains(feature, []string{"power"})
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if socket, ok := detectCPUSocket(chip, featureName); ok {
|
||||||
|
if _, exists := out[socket]; !exists {
|
||||||
|
out[socket] = power
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if isLikelyCPUPower(chip, featureName) {
|
||||||
|
fallback = append(fallback, power)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(out) == 0 && cpuCount == 1 && len(fallback) > 0 {
|
||||||
|
out[0] = fallback[0]
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func detectCPUSocket(parts ...string) (int, bool) {
|
||||||
|
for _, part := range parts {
|
||||||
|
matches := socketIndexRe.FindStringSubmatch(strings.ToLower(part))
|
||||||
|
if len(matches) == 2 {
|
||||||
|
value, err := strconv.Atoi(matches[1])
|
||||||
|
if err == nil {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// isLikelyCPUTemp reports whether a sensors chip/feature name pair looks like
// a CPU temperature source (Intel coretemp, AMD k10temp, package/die labels).
func isLikelyCPUTemp(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	needles := []string{"coretemp", "k10temp", "package id", "tdie", "tctl", "cpu temp"}
	for _, needle := range needles {
		if strings.Contains(haystack, needle) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// isLikelyCPUPower reports whether a sensors chip/feature name pair looks like
// a CPU package power source (intel-rapl driver, package labels, "cpu power").
func isLikelyCPUPower(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	needles := []string{"intel-rapl", "package id", "package-", "cpu power"}
	for _, needle := range needles {
		if strings.Contains(haystack, needle) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
func cpuThrottleBySocket() map[int]bool {
|
||||||
|
out := map[int]bool{}
|
||||||
|
cpuDirs, err := filepath.Glob(filepath.Join(cpuSysBaseDir, "cpu[0-9]*"))
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
sort.Strings(cpuDirs)
|
||||||
|
for _, cpuDir := range cpuDirs {
|
||||||
|
socket, ok := readSocketIndex(cpuDir)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if cpuPackageThrottled(cpuDir) {
|
||||||
|
out[socket] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func readSocketIndex(cpuDir string) (int, bool) {
|
||||||
|
raw, err := os.ReadFile(filepath.Join(cpuDir, "topology", "physical_package_id"))
|
||||||
|
if err != nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
value, err := strconv.Atoi(strings.TrimSpace(string(raw)))
|
||||||
|
if err != nil || value < 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func cpuPackageThrottled(cpuDir string) bool {
|
||||||
|
paths := []string{
|
||||||
|
filepath.Join(cpuDir, "thermal_throttle", "package_throttle_count"),
|
||||||
|
filepath.Join(cpuDir, "thermal_throttle", "core_throttle_count"),
|
||||||
|
}
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
|
||||||
|
if err == nil && value > 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
71
audit/internal/collector/cpu_telemetry_test.go
Normal file
71
audit/internal/collector/cpu_telemetry_test.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestEnrichCPUsWithTelemetry verifies that per-socket temperature, power,
// and throttle telemetry is matched to the correct CPU entries.
func TestEnrichCPUsWithTelemetry(t *testing.T) {
	tmp := t.TempDir()
	oldBase := cpuSysBaseDir
	cpuSysBaseDir = tmp // redirect sysfs reads to the fixture directory
	t.Cleanup(func() { cpuSysBaseDir = oldBase })

	// Socket 0 has a positive package throttle counter; socket 1 does not.
	mustWriteFile(t, filepath.Join(tmp, "cpu0", "topology", "physical_package_id"), "0\n")
	mustWriteFile(t, filepath.Join(tmp, "cpu0", "thermal_throttle", "package_throttle_count"), "3\n")
	mustWriteFile(t, filepath.Join(tmp, "cpu1", "topology", "physical_package_id"), "1\n")
	mustWriteFile(t, filepath.Join(tmp, "cpu1", "thermal_throttle", "package_throttle_count"), "0\n")

	// Sensor features labeled "Package id N" should map to socket N.
	doc := sensorsDoc{
		"coretemp-isa-0000": {
			"Package id 0": map[string]any{"temp1_input": 61.5},
			"Package id 1": map[string]any{"temp2_input": 58.0},
		},
		"intel-rapl-mmio-0": {
			"Package id 0": map[string]any{"power1_average": 180.0},
			"Package id 1": map[string]any{"power2_average": 175.0},
		},
	}

	socket0 := 0
	socket1 := 1
	status := statusOK
	cpus := []schema.HardwareCPU{
		{Socket: &socket0, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
		{Socket: &socket1, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
	}

	got := enrichCPUsWithTelemetry(cpus, doc)

	if got[0].TemperatureC == nil || *got[0].TemperatureC != 61.5 {
		t.Fatalf("cpu0 temperature mismatch: %#v", got[0].TemperatureC)
	}
	if got[0].PowerW == nil || *got[0].PowerW != 180.0 {
		t.Fatalf("cpu0 power mismatch: %#v", got[0].PowerW)
	}
	if got[0].Throttled == nil || !*got[0].Throttled {
		t.Fatalf("cpu0 throttled mismatch: %#v", got[0].Throttled)
	}
	if got[1].TemperatureC == nil || *got[1].TemperatureC != 58.0 {
		t.Fatalf("cpu1 temperature mismatch: %#v", got[1].TemperatureC)
	}
	if got[1].PowerW == nil || *got[1].PowerW != 175.0 {
		t.Fatalf("cpu1 power mismatch: %#v", got[1].PowerW)
	}
	// cpu1's counter is zero, so Throttled must stay unset or false.
	if got[1].Throttled != nil && *got[1].Throttled {
		t.Fatalf("cpu1 throttled mismatch: %#v", got[1].Throttled)
	}
}
|
||||||
|
|
||||||
|
// mustWriteFile writes content to path, creating any missing parent
// directories, and fails the test immediately on error.
func mustWriteFile(t *testing.T, path, content string) {
	t.Helper()
	parent := filepath.Dir(path)
	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", path, mkErr)
	}
	if writeErr := os.WriteFile(path, []byte(content), 0644); writeErr != nil {
		t.Fatalf("write %s: %v", path, writeErr)
	}
}
|
||||||
@@ -1,12 +1,14 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestParseCPUs_dual_socket(t *testing.T) {
|
func TestParseCPUs_dual_socket(t *testing.T) {
|
||||||
out := mustReadFile(t, "testdata/dmidecode_type4.txt")
|
out := mustReadFile(t, "testdata/dmidecode_type4.txt")
|
||||||
cpus := parseCPUs(out, "CAR315KA0803B90")
|
cpus := parseCPUs(out)
|
||||||
|
|
||||||
if len(cpus) != 2 {
|
if len(cpus) != 2 {
|
||||||
t.Fatalf("expected 2 CPUs, got %d", len(cpus))
|
t.Fatalf("expected 2 CPUs, got %d", len(cpus))
|
||||||
@@ -37,23 +39,22 @@ func TestParseCPUs_dual_socket(t *testing.T) {
|
|||||||
if cpu0.Status == nil || *cpu0.Status != "OK" {
|
if cpu0.Status == nil || *cpu0.Status != "OK" {
|
||||||
t.Errorf("cpu0 status: got %v, want OK", cpu0.Status)
|
t.Errorf("cpu0 status: got %v, want OK", cpu0.Status)
|
||||||
}
|
}
|
||||||
// Intel Xeon serial not available → fallback
|
if cpu0.SerialNumber != nil {
|
||||||
if cpu0.SerialNumber == nil || *cpu0.SerialNumber != "CAR315KA0803B90-CPU-0" {
|
t.Errorf("cpu0 serial should stay nil without source data, got %v", cpu0.SerialNumber)
|
||||||
t.Errorf("cpu0 serial fallback: got %v, want CAR315KA0803B90-CPU-0", cpu0.SerialNumber)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cpu1 := cpus[1]
|
cpu1 := cpus[1]
|
||||||
if cpu1.Socket == nil || *cpu1.Socket != 1 {
|
if cpu1.Socket == nil || *cpu1.Socket != 1 {
|
||||||
t.Errorf("cpu1 socket: got %v, want 1", cpu1.Socket)
|
t.Errorf("cpu1 socket: got %v, want 1", cpu1.Socket)
|
||||||
}
|
}
|
||||||
if cpu1.SerialNumber == nil || *cpu1.SerialNumber != "CAR315KA0803B90-CPU-1" {
|
if cpu1.SerialNumber != nil {
|
||||||
t.Errorf("cpu1 serial fallback: got %v, want CAR315KA0803B90-CPU-1", cpu1.SerialNumber)
|
t.Errorf("cpu1 serial should stay nil without source data, got %v", cpu1.SerialNumber)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseCPUs_unpopulated_skipped(t *testing.T) {
|
func TestParseCPUs_unpopulated_skipped(t *testing.T) {
|
||||||
out := mustReadFile(t, "testdata/dmidecode_type4_disabled.txt")
|
out := mustReadFile(t, "testdata/dmidecode_type4_disabled.txt")
|
||||||
cpus := parseCPUs(out, "BOARD-001")
|
cpus := parseCPUs(out)
|
||||||
|
|
||||||
if len(cpus) != 1 {
|
if len(cpus) != 1 {
|
||||||
t.Fatalf("expected 1 CPU (unpopulated skipped), got %d", len(cpus))
|
t.Fatalf("expected 1 CPU (unpopulated skipped), got %d", len(cpus))
|
||||||
@@ -63,18 +64,51 @@ func TestParseCPUs_unpopulated_skipped(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCollectCPUsSetsFirmwareFromMicrocode(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
origBase := cpuSysBaseDir
|
||||||
|
cpuSysBaseDir = tmp
|
||||||
|
t.Cleanup(func() { cpuSysBaseDir = origBase })
|
||||||
|
|
||||||
|
if err := os.MkdirAll(filepath.Join(tmp, "cpu0", "microcode"), 0755); err != nil {
|
||||||
|
t.Fatalf("mkdir microcode dir: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(tmp, "cpu0", "microcode", "version"), []byte("0x2b000643\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("write microcode version: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
origRun := execDmidecode
|
||||||
|
execDmidecode = func(typeNum string) (string, error) {
|
||||||
|
if typeNum != "4" {
|
||||||
|
t.Fatalf("unexpected dmidecode type: %s", typeNum)
|
||||||
|
}
|
||||||
|
return mustReadFile(t, "testdata/dmidecode_type4.txt"), nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { execDmidecode = origRun })
|
||||||
|
|
||||||
|
cpus := collectCPUs()
|
||||||
|
if len(cpus) != 2 {
|
||||||
|
t.Fatalf("expected 2 CPUs, got %d", len(cpus))
|
||||||
|
}
|
||||||
|
for i, cpu := range cpus {
|
||||||
|
if cpu.Firmware == nil || *cpu.Firmware != "0x2b000643" {
|
||||||
|
t.Fatalf("cpu[%d] firmware=%v want microcode", i, cpu.Firmware)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseCPUStatus(t *testing.T) {
|
func TestParseCPUStatus(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
input string
|
input string
|
||||||
want string
|
want string
|
||||||
}{
|
}{
|
||||||
{"Populated, Enabled", "OK"},
|
{"Populated, Enabled", "OK"},
|
||||||
{"Populated, Disabled By User", "WARNING"},
|
{"Populated, Disabled By User", statusWarning},
|
||||||
{"Populated, Disabled By BIOS", "WARNING"},
|
{"Populated, Disabled By BIOS", statusWarning},
|
||||||
{"Unpopulated", "EMPTY"},
|
{"Unpopulated", statusEmpty},
|
||||||
{"Not Populated", "EMPTY"},
|
{"Not Populated", statusEmpty},
|
||||||
{"Unknown", "UNKNOWN"},
|
{"Unknown", statusUnknown},
|
||||||
{"", "UNKNOWN"},
|
{"", statusUnknown},
|
||||||
}
|
}
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
got := parseCPUStatus(tt.input)
|
got := parseCPUStatus(tt.input)
|
||||||
|
|||||||
88
audit/internal/collector/finalize.go
Normal file
88
audit/internal/collector/finalize.go
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "bee/audit/internal/schema"
|
||||||
|
|
||||||
|
// finalizeSnapshot prunes unidentifiable components from the snapshot (DIMMs,
// disks, and PSUs without the identity fields their filters require) and then
// stamps collectedAt onto every remaining component's status record.
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
	snap.Memory = filterMemory(snap.Memory)
	snap.Storage = filterStorage(snap.Storage)
	snap.PowerSupplies = filterPSUs(snap.PowerSupplies)

	setComponentStatusMetadata(snap, collectedAt)
}
|
||||||
|
|
||||||
|
func filterMemory(dimms []schema.HardwareMemory) []schema.HardwareMemory {
|
||||||
|
out := make([]schema.HardwareMemory, 0, len(dimms))
|
||||||
|
for _, dimm := range dimms {
|
||||||
|
if dimm.Present != nil && !*dimm.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if dimm.Status != nil && *dimm.Status == statusEmpty {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if dimm.SerialNumber == nil || *dimm.SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, dimm)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||||
|
out := make([]schema.HardwareStorage, 0, len(disks))
|
||||||
|
for _, disk := range disks {
|
||||||
|
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, disk)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||||
|
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||||
|
for _, psu := range psus {
|
||||||
|
hasIdentity := false
|
||||||
|
switch {
|
||||||
|
case psu.SerialNumber != nil && *psu.SerialNumber != "":
|
||||||
|
hasIdentity = true
|
||||||
|
case psu.Slot != nil && *psu.Slot != "":
|
||||||
|
hasIdentity = true
|
||||||
|
case psu.Model != nil && *psu.Model != "":
|
||||||
|
hasIdentity = true
|
||||||
|
case psu.Vendor != nil && *psu.Vendor != "":
|
||||||
|
hasIdentity = true
|
||||||
|
}
|
||||||
|
if !hasIdentity {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, psu)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// setComponentStatusMetadata stamps collectedAt onto the status record of
// every component in the snapshot, across all tracked component categories.
// Each category is a distinctly-typed slice, hence the repeated loops.
func setComponentStatusMetadata(snap *schema.HardwareSnapshot, collectedAt string) {
	for i := range snap.CPUs {
		setStatusCheckedAt(&snap.CPUs[i].HardwareComponentStatus, collectedAt)
	}
	for i := range snap.Memory {
		setStatusCheckedAt(&snap.Memory[i].HardwareComponentStatus, collectedAt)
	}
	for i := range snap.Storage {
		setStatusCheckedAt(&snap.Storage[i].HardwareComponentStatus, collectedAt)
	}
	for i := range snap.PCIeDevices {
		setStatusCheckedAt(&snap.PCIeDevices[i].HardwareComponentStatus, collectedAt)
	}
	for i := range snap.PowerSupplies {
		setStatusCheckedAt(&snap.PowerSupplies[i].HardwareComponentStatus, collectedAt)
	}
}
|
||||||
|
|
||||||
|
func setStatusCheckedAt(status *schema.HardwareComponentStatus, collectedAt string) {
|
||||||
|
if status == nil || status.Status == nil || *status.Status == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if status.StatusCheckedAt == nil {
|
||||||
|
status.StatusCheckedAt = &collectedAt
|
||||||
|
}
|
||||||
|
}
|
||||||
80
audit/internal/collector/finalize_test.go
Normal file
80
audit/internal/collector/finalize_test.go
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials checks that
// finalizeSnapshot drops memory/storage/PSU entries lacking a serial number
// and stamps StatusCheckedAt on the entries that survive.
func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
	collectedAt := "2026-03-15T12:00:00Z"
	present := true
	status := statusOK
	serial := "SN-1"

	// Each category gets one identifiable entry and one without a serial.
	snap := schema.HardwareSnapshot{
		Memory: []schema.HardwareMemory{
			{Present: &present, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
			{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
		},
		Storage: []schema.HardwareStorage{
			{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
			{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
		},
		PowerSupplies: []schema.HardwarePowerSupply{
			{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
			{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
		},
	}

	finalizeSnapshot(&snap, collectedAt)

	if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
		t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
	}
	if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
		t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
	}
	if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
		t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
	}
}
|
||||||
|
|
||||||
|
// TestFinalizeSnapshotPreservesDuplicateSerials ensures the finalize pass does
// not deduplicate or rewrite serial numbers shared by multiple disks.
func TestFinalizeSnapshotPreservesDuplicateSerials(t *testing.T) {
	collectedAt := "2026-03-15T12:00:00Z"
	status := statusOK
	model := "Device"
	serial := "DUPLICATE"

	// Two disks deliberately share the same serial number.
	snap := schema.HardwareSnapshot{
		Storage: []schema.HardwareStorage{
			{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
			{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
		},
	}

	finalizeSnapshot(&snap, collectedAt)

	if got := *snap.Storage[0].SerialNumber; got != serial {
		t.Fatalf("first serial changed: %q", got)
	}
	if got := *snap.Storage[1].SerialNumber; got != serial {
		t.Fatalf("duplicate serial should stay unchanged: %q", got)
	}
}
|
||||||
|
|
||||||
|
// TestFilterPSUsKeepsSlotOnlyEntries checks that a PSU identified only by its
// slot is retained while a PSU with no identity fields at all is dropped.
func TestFilterPSUsKeepsSlotOnlyEntries(t *testing.T) {
	slot := "0"
	status := statusOK

	got := filterPSUs([]schema.HardwarePowerSupply{
		{Slot: &slot, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
		{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
	})

	if len(got) != 1 {
		t.Fatalf("len(got)=%d want 1", len(got))
	}
	if got[0].Slot == nil || *got[0].Slot != "0" {
		t.Fatalf("unexpected kept PSU: %+v", got[0])
	}
}
|
||||||
@@ -47,12 +47,12 @@ func parseMemorySection(fields map[string]string) schema.HardwareMemory {
|
|||||||
dimm.Present = &present
|
dimm.Present = &present
|
||||||
|
|
||||||
if !present {
|
if !present {
|
||||||
status := "EMPTY"
|
status := statusEmpty
|
||||||
dimm.Status = &status
|
dimm.Status = &status
|
||||||
return dimm
|
return dimm
|
||||||
}
|
}
|
||||||
|
|
||||||
status := "OK"
|
status := statusOK
|
||||||
dimm.Status = &status
|
dimm.Status = &status
|
||||||
|
|
||||||
if mb := parseMemorySizeMB(rawSize); mb > 0 {
|
if mb := parseMemorySizeMB(rawSize); mb > 0 {
|
||||||
|
|||||||
203
audit/internal/collector/memory_telemetry.go
Normal file
203
audit/internal/collector/memory_telemetry.go
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// edacBaseDir is the sysfs root for EDAC memory controllers. It is a variable
// (not a constant) so tests can point it at a fixture directory.
var edacBaseDir = "/sys/devices/system/edac/mc"

// edacDIMMStats holds the ECC error counters read from one EDAC dimm*
// directory. A nil counter means the corresponding sysfs file was absent.
type edacDIMMStats struct {
	Label   string // BIOS-assigned DIMM label, e.g. "CPU0_DIMM_A1"
	CECount *int64 // correctable ECC error count
	UECount *int64 // uncorrectable ECC error count
}
|
||||||
|
|
||||||
|
// enrichMemoryWithTelemetry adds sensor temperatures and EDAC ECC error
// counters to DIMM entries, matching by canonicalized slot/location labels.
// Uncorrectable errors mark the DIMM CRITICAL (with the data-loss flag set);
// correctable errors downgrade an OK/unset status to WARNING. The slice is
// mutated in place and returned for convenience.
func enrichMemoryWithTelemetry(dimms []schema.HardwareMemory, doc sensorsDoc) []schema.HardwareMemory {
	if len(dimms) == 0 {
		return dimms
	}

	tempByLabel := memoryTempsFromSensors(doc)
	stats := readEDACStats()

	for i := range dimms {
		// Candidate keys, in priority order (slot first, then location).
		labelKeys := dimmMatchKeys(dimms[i].Slot, dimms[i].Location)

		// First key with a temperature reading wins.
		for _, key := range labelKeys {
			if temp, ok := tempByLabel[key]; ok {
				dimms[i].TemperatureC = &temp
				break
			}
		}

		// First key with EDAC stats wins; it drives counters and status.
		for _, key := range labelKeys {
			if stat, ok := stats[key]; ok {
				if stat.CECount != nil {
					dimms[i].CorrectableECCErrorCount = stat.CECount
				}
				if stat.UECount != nil {
					dimms[i].UncorrectableECCErrorCount = stat.UECount
				}
				if stat.UECount != nil && *stat.UECount > 0 {
					// Uncorrectable errors imply possible data loss: force CRITICAL.
					dimms[i].DataLossDetected = boolPtr(true)
					status := statusCritical
					dimms[i].Status = &status
					if dimms[i].ErrorDescription == nil {
						dimms[i].ErrorDescription = stringPtr("EDAC reports uncorrectable ECC errors")
					}
				} else if stat.CECount != nil && *stat.CECount > 0 && (dimms[i].Status == nil || *dimms[i].Status == statusOK) {
					// Correctable errors only escalate an OK/unset status, never
					// downgrade a worse one.
					status := statusWarning
					dimms[i].Status = &status
					if dimms[i].ErrorDescription == nil {
						dimms[i].ErrorDescription = stringPtr("EDAC reports correctable ECC errors")
					}
				}
				break
			}
		}
	}

	return dimms
}
|
||||||
|
|
||||||
|
func memoryTempsFromSensors(doc sensorsDoc) map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
if len(doc) == 0 {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for chip, features := range doc {
|
||||||
|
for featureName, raw := range features {
|
||||||
|
feature, ok := raw.(map[string]any)
|
||||||
|
if !ok || classifySensorFeature(feature) != "temp" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isLikelyMemoryTemp(chip, featureName) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
temp, ok := firstFeatureFloat(feature, "_input")
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := canonicalLabel(featureName)
|
||||||
|
if key == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, exists := out[key]; !exists {
|
||||||
|
out[key] = temp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func readEDACStats() map[string]edacDIMMStats {
|
||||||
|
out := map[string]edacDIMMStats{}
|
||||||
|
mcDirs, err := filepath.Glob(filepath.Join(edacBaseDir, "mc*"))
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
sort.Strings(mcDirs)
|
||||||
|
for _, mcDir := range mcDirs {
|
||||||
|
dimmDirs, err := filepath.Glob(filepath.Join(mcDir, "dimm*"))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sort.Strings(dimmDirs)
|
||||||
|
for _, dimmDir := range dimmDirs {
|
||||||
|
stat, ok := readEDACDIMMStats(dimmDir)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := canonicalLabel(stat.Label)
|
||||||
|
if key == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out[key] = stat
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func readEDACDIMMStats(dimmDir string) (edacDIMMStats, bool) {
|
||||||
|
labelBytes, err := os.ReadFile(filepath.Join(dimmDir, "dimm_label"))
|
||||||
|
if err != nil {
|
||||||
|
labelBytes, err = os.ReadFile(filepath.Join(dimmDir, "label"))
|
||||||
|
if err != nil {
|
||||||
|
return edacDIMMStats{}, false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(string(labelBytes))
|
||||||
|
if label == "" {
|
||||||
|
return edacDIMMStats{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
stat := edacDIMMStats{Label: label}
|
||||||
|
if value, ok := readEDACCount(dimmDir, []string{"dimm_ce_count", "ce_count"}); ok {
|
||||||
|
stat.CECount = &value
|
||||||
|
}
|
||||||
|
if value, ok := readEDACCount(dimmDir, []string{"dimm_ue_count", "ue_count"}); ok {
|
||||||
|
stat.UECount = &value
|
||||||
|
}
|
||||||
|
return stat, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func readEDACCount(dir string, names []string) (int64, bool) {
|
||||||
|
for _, name := range names {
|
||||||
|
raw, err := os.ReadFile(filepath.Join(dir, name))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
|
||||||
|
if err == nil && value >= 0 {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func dimmMatchKeys(slot, location *string) []string {
|
||||||
|
var out []string
|
||||||
|
add := func(value *string) {
|
||||||
|
key := canonicalLabel(derefString(value))
|
||||||
|
if key == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, existing := range out {
|
||||||
|
if existing == key {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = append(out, key)
|
||||||
|
}
|
||||||
|
add(slot)
|
||||||
|
add(location)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// canonicalLabel uppercases a label and strips every character other than
// A-Z and 0-9, producing a stable key for matching labels across sources
// (e.g. "CPU0_DIMM_A1" and "CPU0 DIMM A1" both become "CPU0DIMMA1").
func canonicalLabel(value string) string {
	normalized := strings.ToUpper(strings.TrimSpace(value))
	keep := func(r rune) rune {
		if (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') {
			return r
		}
		return -1 // strings.Map drops runes mapped to a negative value
	}
	return strings.Map(keep, normalized)
}
|
||||||
|
|
||||||
|
// isLikelyMemoryTemp reports whether a sensors chip/feature name pair looks
// like a DIMM temperature sensor (a "dimm"/"sodimm" label anywhere in either
// name, case-insensitively).
func isLikelyMemoryTemp(chip, feature string) bool {
	combined := strings.ToLower(chip + " " + feature)
	for _, needle := range []string{"dimm", "sodimm"} {
		if strings.Contains(combined, needle) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// boolPtr returns a pointer to a copy of value.
func boolPtr(value bool) *bool {
	copied := value
	return &copied
}
|
||||||
61
audit/internal/collector/memory_telemetry_test.go
Normal file
61
audit/internal/collector/memory_telemetry_test.go
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestEnrichMemoryWithTelemetry verifies that EDAC counters and jc42-style
// sensor temperatures are matched to DIMMs by canonicalized label, and that
// CE errors yield WARNING while UE errors yield CRITICAL with data loss.
func TestEnrichMemoryWithTelemetry(t *testing.T) {
	tmp := t.TempDir()
	oldBase := edacBaseDir
	edacBaseDir = tmp // redirect EDAC sysfs reads to the fixture directory
	t.Cleanup(func() { edacBaseDir = oldBase })

	// dimm0: correctable errors only; dimm1: uncorrectable errors.
	mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_label"), "CPU0_DIMM_A1\n")
	mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_ce_count"), "7\n")
	mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_ue_count"), "0\n")
	mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_label"), "CPU1_DIMM_B2\n")
	mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_ce_count"), "0\n")
	mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_ue_count"), "2\n")

	// Sensor labels use spaces; EDAC labels use underscores — canonicalization
	// must make them match.
	doc := sensorsDoc{
		"jc42-i2c-0-18": {
			"CPU0 DIMM A1": map[string]any{"temp1_input": 43.0},
			"CPU1 DIMM B2": map[string]any{"temp2_input": 46.0},
		},
	}

	status := statusOK
	slotA := "CPU0_DIMM_A1"
	slotB := "CPU1_DIMM_B2"
	dimms := []schema.HardwareMemory{
		{Slot: &slotA, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
		{Slot: &slotB, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
	}

	got := enrichMemoryWithTelemetry(dimms, doc)

	if got[0].TemperatureC == nil || *got[0].TemperatureC != 43.0 {
		t.Fatalf("dimm0 temperature mismatch: %#v", got[0].TemperatureC)
	}
	if got[0].CorrectableECCErrorCount == nil || *got[0].CorrectableECCErrorCount != 7 {
		t.Fatalf("dimm0 ce mismatch: %#v", got[0].CorrectableECCErrorCount)
	}
	if got[0].Status == nil || *got[0].Status != statusWarning {
		t.Fatalf("dimm0 status mismatch: %#v", got[0].Status)
	}
	if got[1].TemperatureC == nil || *got[1].TemperatureC != 46.0 {
		t.Fatalf("dimm1 temperature mismatch: %#v", got[1].TemperatureC)
	}
	if got[1].UncorrectableECCErrorCount == nil || *got[1].UncorrectableECCErrorCount != 2 {
		t.Fatalf("dimm1 ue mismatch: %#v", got[1].UncorrectableECCErrorCount)
	}
	if got[1].Status == nil || *got[1].Status != statusCritical {
		t.Fatalf("dimm1 status mismatch: %#v", got[1].Status)
	}
	if got[1].DataLossDetected == nil || !*got[1].DataLossDetected {
		t.Fatalf("dimm1 data_loss_detected mismatch: %#v", got[1].DataLossDetected)
	}
}
|
||||||
@@ -18,17 +18,13 @@ var (
|
|||||||
}
|
}
|
||||||
return string(out), nil
|
return string(out), nil
|
||||||
}
|
}
|
||||||
readNetStatFile = func(iface, key string) (int64, error) {
|
readNetAddressFile = func(iface string) (string, error) {
|
||||||
path := filepath.Join("/sys/class/net", iface, "statistics", key)
|
path := filepath.Join("/sys/class/net", iface, "address")
|
||||||
raw, err := os.ReadFile(path)
|
raw, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, err
|
return "", err
|
||||||
}
|
}
|
||||||
v, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
|
return strings.TrimSpace(string(raw)), nil
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
return v, nil
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -47,6 +43,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
iface := ifaces[0]
|
iface := ifaces[0]
|
||||||
|
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
|
||||||
|
if devs[i].SerialNumber == nil {
|
||||||
|
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||||
|
devs[i].SerialNumber = &serial
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if devs[i].Firmware == nil {
|
if devs[i].Firmware == nil {
|
||||||
if out, err := ethtoolInfoQuery(iface); err == nil {
|
if out, err := ethtoolInfoQuery(iface); err == nil {
|
||||||
@@ -56,16 +58,13 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if devs[i].Telemetry == nil {
|
|
||||||
devs[i].Telemetry = map[string]any{}
|
|
||||||
}
|
|
||||||
injectNICPacketStats(devs[i].Telemetry, iface)
|
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
injectSFPDOMTelemetry(devs[i].Telemetry, out)
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
|
enriched++
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].Telemetry) == 0 {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
devs[i].Telemetry = nil
|
|
||||||
} else {
|
|
||||||
enriched++
|
enriched++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -77,31 +76,32 @@ func isNICDevice(dev schema.HardwarePCIeDevice) bool {
|
|||||||
if dev.DeviceClass == nil {
|
if dev.DeviceClass == nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
|
c := strings.TrimSpace(*dev.DeviceClass)
|
||||||
return strings.Contains(c, "ethernet controller") ||
|
return isNICClass(c) || strings.EqualFold(c, "FibreChannelController")
|
||||||
strings.Contains(c, "network controller") ||
|
|
||||||
strings.Contains(c, "infiniband controller")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func injectNICPacketStats(dst map[string]any, iface string) {
|
func collectInterfaceMACs(ifaces []string) []string {
|
||||||
for _, key := range []string{"rx_packets", "tx_packets", "rx_errors", "tx_errors"} {
|
seen := map[string]struct{}{}
|
||||||
if v, err := readNetStatFile(iface, key); err == nil {
|
var out []string
|
||||||
dst[key] = v
|
for _, iface := range ifaces {
|
||||||
|
mac, err := readNetAddressFile(iface)
|
||||||
|
if err != nil || mac == "" {
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
mac = strings.ToLower(strings.TrimSpace(mac))
|
||||||
|
if _, ok := seen[mac]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[mac] = struct{}{}
|
||||||
|
out = append(out, mac)
|
||||||
}
|
}
|
||||||
}
|
return out
|
||||||
|
|
||||||
func injectSFPDOMTelemetry(dst map[string]any, raw string) {
|
|
||||||
parsed := parseSFPDOM(raw)
|
|
||||||
for k, v := range parsed {
|
|
||||||
dst[k] = v
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var floatRe = regexp.MustCompile(`[-+]?[0-9]*\.?[0-9]+`)
|
var floatRe = regexp.MustCompile(`[-+]?[0-9]*\.?[0-9]+`)
|
||||||
|
|
||||||
func parseSFPDOM(raw string) map[string]any {
|
func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||||
out := map[string]any{}
|
var changed bool
|
||||||
for _, line := range strings.Split(raw, "\n") {
|
for _, line := range strings.Split(raw, "\n") {
|
||||||
trimmed := strings.TrimSpace(line)
|
trimmed := strings.TrimSpace(line)
|
||||||
if trimmed == "" {
|
if trimmed == "" {
|
||||||
@@ -117,26 +117,55 @@ func parseSFPDOM(raw string) map[string]any {
|
|||||||
switch {
|
switch {
|
||||||
case strings.Contains(key, "module temperature"):
|
case strings.Contains(key, "module temperature"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
out["sfp_temperature_c"] = f
|
dev.SFPTemperatureC = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
case strings.Contains(key, "laser output power"):
|
case strings.Contains(key, "laser output power"):
|
||||||
if f, ok := dbmValue(val); ok {
|
if f, ok := dbmValue(val); ok {
|
||||||
out["sfp_tx_power_dbm"] = f
|
dev.SFPTXPowerDBM = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
case strings.Contains(key, "receiver signal"):
|
case strings.Contains(key, "receiver signal"):
|
||||||
if f, ok := dbmValue(val); ok {
|
if f, ok := dbmValue(val); ok {
|
||||||
out["sfp_rx_power_dbm"] = f
|
dev.SFPRXPowerDBM = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
case strings.Contains(key, "module voltage"):
|
case strings.Contains(key, "module voltage"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
out["sfp_voltage_v"] = f
|
dev.SFPVoltageV = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
case strings.Contains(key, "laser bias current"):
|
case strings.Contains(key, "laser bias current"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
out["sfp_bias_ma"] = f
|
dev.SFPBiasMA = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return changed
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseSFPDOM(raw string) map[string]any {
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||||
|
return map[string]any{}
|
||||||
|
}
|
||||||
|
out := map[string]any{}
|
||||||
|
if dev.SFPTemperatureC != nil {
|
||||||
|
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||||
|
}
|
||||||
|
if dev.SFPTXPowerDBM != nil {
|
||||||
|
out["sfp_tx_power_dbm"] = *dev.SFPTXPowerDBM
|
||||||
|
}
|
||||||
|
if dev.SFPRXPowerDBM != nil {
|
||||||
|
out["sfp_rx_power_dbm"] = *dev.SFPRXPowerDBM
|
||||||
|
}
|
||||||
|
if dev.SFPVoltageV != nil {
|
||||||
|
out["sfp_voltage_v"] = *dev.SFPVoltageV
|
||||||
|
}
|
||||||
|
if dev.SFPBiasMA != nil {
|
||||||
|
out["sfp_bias_ma"] = *dev.SFPBiasMA
|
||||||
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestParseSFPDOM(t *testing.T) {
|
func TestParseSFPDOM(t *testing.T) {
|
||||||
raw := `
|
raw := `
|
||||||
@@ -29,6 +33,74 @@ func TestParseSFPDOM(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseLSPCIDetailSerial(t *testing.T) {
|
||||||
|
raw := `
|
||||||
|
05:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
|
||||||
|
Serial number: NIC-SN-12345
|
||||||
|
`
|
||||||
|
if got := parseLSPCIDetailSerial(raw); got != "NIC-SN-12345" {
|
||||||
|
t.Fatalf("serial=%q want %q", got, "NIC-SN-12345")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParsePCIVPDSerial(t *testing.T) {
|
||||||
|
raw := []byte{0x82, 0x05, 0x00, 'M', 'L', 'X', '5', 0x90, 0x08, 0x00, 'S', 'N', 0x08, 'M', 'T', '1', '2', '3', '4', '5', '6'}
|
||||||
|
if got := parsePCIVPDSerial(raw); got != "MT123456" {
|
||||||
|
t.Fatalf("serial=%q want %q", got, "MT123456")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||||
|
origDetail := queryPCILSPCIDetail
|
||||||
|
origVPD := readPCIVPDFile
|
||||||
|
origIfaces := netIfacesByBDF
|
||||||
|
origReadMAC := readNetAddressFile
|
||||||
|
origEth := ethtoolInfoQuery
|
||||||
|
origModule := ethtoolModuleQuery
|
||||||
|
t.Cleanup(func() {
|
||||||
|
queryPCILSPCIDetail = origDetail
|
||||||
|
readPCIVPDFile = origVPD
|
||||||
|
netIfacesByBDF = origIfaces
|
||||||
|
readNetAddressFile = origReadMAC
|
||||||
|
ethtoolInfoQuery = origEth
|
||||||
|
ethtoolModuleQuery = origModule
|
||||||
|
})
|
||||||
|
|
||||||
|
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||||
|
if bdf != "0000:18:00.0" {
|
||||||
|
t.Fatalf("unexpected bdf: %s", bdf)
|
||||||
|
}
|
||||||
|
return "Serial number: NIC-SN-98765\n", nil
|
||||||
|
}
|
||||||
|
readPCIVPDFile = func(string) ([]byte, error) {
|
||||||
|
return nil, fmt.Errorf("no vpd needed")
|
||||||
|
}
|
||||||
|
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||||
|
readNetAddressFile = func(iface string) (string, error) {
|
||||||
|
if iface != "eth0" {
|
||||||
|
t.Fatalf("unexpected iface: %s", iface)
|
||||||
|
}
|
||||||
|
return "aa:bb:cc:dd:ee:ff", nil
|
||||||
|
}
|
||||||
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
|
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||||
|
|
||||||
|
class := "EthernetController"
|
||||||
|
bdf := "0000:18:00.0"
|
||||||
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
|
DeviceClass: &class,
|
||||||
|
BDF: &bdf,
|
||||||
|
}}
|
||||||
|
|
||||||
|
out := enrichPCIeWithNICTelemetry(devs)
|
||||||
|
if out[0].SerialNumber == nil || *out[0].SerialNumber != "NIC-SN-98765" {
|
||||||
|
t.Fatalf("serial=%v want NIC-SN-98765", out[0].SerialNumber)
|
||||||
|
}
|
||||||
|
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||||
|
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestDBMValue(t *testing.T) {
|
func TestDBMValue(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
in string
|
in string
|
||||||
|
|||||||
@@ -24,18 +24,29 @@ type nvidiaGPUInfo struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||||
// If the driver/tool is unavailable, NVIDIA devices get UNKNOWN status and
|
// If the driver/tool is unavailable, NVIDIA devices get Unknown status.
|
||||||
// a stable serial fallback based on board serial + slot.
|
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
|
if !hasNVIDIADevices(devs) {
|
||||||
|
return devs
|
||||||
|
}
|
||||||
gpuByBDF, err := queryNVIDIAGPUs()
|
gpuByBDF, err := queryNVIDIAGPUs()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Info("nvidia: enrichment skipped", "err", err)
|
slog.Info("nvidia: enrichment skipped", "err", err)
|
||||||
return enrichPCIeWithNVIDIAData(devs, nil, boardSerial, false)
|
return enrichPCIeWithNVIDIAData(devs, nil, false)
|
||||||
}
|
}
|
||||||
return enrichPCIeWithNVIDIAData(devs, gpuByBDF, boardSerial, true)
|
return enrichPCIeWithNVIDIAData(devs, gpuByBDF, true)
|
||||||
}
|
}
|
||||||
|
|
||||||
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, boardSerial string, driverLoaded bool) []schema.HardwarePCIeDevice {
|
func hasNVIDIADevices(devs []schema.HardwarePCIeDevice) bool {
|
||||||
|
for _, dev := range devs {
|
||||||
|
if isNVIDIADevice(dev) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, driverLoaded bool) []schema.HardwarePCIeDevice {
|
||||||
enriched := 0
|
enriched := 0
|
||||||
for i := range devs {
|
for i := range devs {
|
||||||
if !isNVIDIADevice(devs[i]) {
|
if !isNVIDIADevice(devs[i]) {
|
||||||
@@ -43,7 +54,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !driverLoaded {
|
if !driverLoaded {
|
||||||
setPCIeFallback(&devs[i], boardSerial)
|
setPCIeFallback(&devs[i])
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,22 +64,21 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
}
|
}
|
||||||
info, ok := gpuByBDF[bdf]
|
info, ok := gpuByBDF[bdf]
|
||||||
if !ok {
|
if !ok {
|
||||||
setPCIeFallback(&devs[i], boardSerial)
|
setPCIeFallback(&devs[i])
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||||
devs[i].SerialNumber = &v
|
devs[i].SerialNumber = &v
|
||||||
} else {
|
|
||||||
setPCIeFallbackSerial(&devs[i], boardSerial)
|
|
||||||
}
|
}
|
||||||
if v := strings.TrimSpace(info.VBIOS); v != "" {
|
if v := strings.TrimSpace(info.VBIOS); v != "" {
|
||||||
devs[i].Firmware = &v
|
devs[i].Firmware = &v
|
||||||
}
|
}
|
||||||
|
|
||||||
status := "OK"
|
status := statusOK
|
||||||
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
|
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
|
||||||
status = "WARNING"
|
status = statusWarning
|
||||||
|
devs[i].ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
|
||||||
}
|
}
|
||||||
devs[i].Status = &status
|
devs[i].Status = &status
|
||||||
injectNVIDIATelemetry(&devs[i], info)
|
injectNVIDIATelemetry(&devs[i], info)
|
||||||
@@ -200,46 +210,25 @@ func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||||
setPCIeFallbackSerial(dev, boardSerial)
|
status := statusUnknown
|
||||||
status := "UNKNOWN"
|
|
||||||
dev.Status = &status
|
dev.Status = &status
|
||||||
}
|
}
|
||||||
|
|
||||||
func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
|
||||||
if strings.TrimSpace(boardSerial) == "" || dev.SerialNumber != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
slot := "unknown"
|
|
||||||
if dev.BDF != nil && strings.TrimSpace(*dev.BDF) != "" {
|
|
||||||
slot = strings.TrimSpace(*dev.BDF)
|
|
||||||
} else if dev.Slot != nil && strings.TrimSpace(*dev.Slot) != "" {
|
|
||||||
slot = strings.TrimSpace(*dev.Slot)
|
|
||||||
}
|
|
||||||
fb := fmt.Sprintf("%s-PCIE-%s", boardSerial, slot)
|
|
||||||
dev.SerialNumber = &fb
|
|
||||||
}
|
|
||||||
|
|
||||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||||
if dev.Telemetry == nil {
|
|
||||||
dev.Telemetry = map[string]any{}
|
|
||||||
}
|
|
||||||
if info.TemperatureC != nil {
|
if info.TemperatureC != nil {
|
||||||
dev.Telemetry["temperature_c"] = *info.TemperatureC
|
dev.TemperatureC = info.TemperatureC
|
||||||
}
|
}
|
||||||
if info.PowerW != nil {
|
if info.PowerW != nil {
|
||||||
dev.Telemetry["power_w"] = *info.PowerW
|
dev.PowerW = info.PowerW
|
||||||
}
|
}
|
||||||
if info.ECCUncorrected != nil {
|
if info.ECCUncorrected != nil {
|
||||||
dev.Telemetry["ecc_uncorrected_total"] = *info.ECCUncorrected
|
dev.ECCUncorrectedTotal = info.ECCUncorrected
|
||||||
}
|
}
|
||||||
if info.ECCCorrected != nil {
|
if info.ECCCorrected != nil {
|
||||||
dev.Telemetry["ecc_corrected_total"] = *info.ECCCorrected
|
dev.ECCCorrectedTotal = info.ECCCorrected
|
||||||
}
|
}
|
||||||
if info.HWSlowdown != nil {
|
if info.HWSlowdown != nil {
|
||||||
dev.Telemetry["hw_slowdown_active"] = *info.HWSlowdown
|
dev.HWSlowdown = info.HWSlowdown
|
||||||
}
|
|
||||||
if len(dev.Telemetry) == 0 {
|
|
||||||
dev.Telemetry = nil
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -54,10 +54,10 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
status := "OK"
|
status := "OK"
|
||||||
devices := []schema.HardwarePCIeDevice{
|
devices := []schema.HardwarePCIeDevice{
|
||||||
{
|
{
|
||||||
VendorID: &vendorID,
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
BDF: &bdf,
|
VendorID: &vendorID,
|
||||||
Manufacturer: &manufacturer,
|
BDF: &bdf,
|
||||||
Status: &status,
|
Manufacturer: &manufacturer,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -73,21 +73,21 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
out := enrichPCIeWithNVIDIAData(devices, byBDF, "BOARD-001", true)
|
out := enrichPCIeWithNVIDIAData(devices, byBDF, true)
|
||||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-ABC" {
|
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-ABC" {
|
||||||
t.Fatalf("serial: got %v", out[0].SerialNumber)
|
t.Fatalf("serial: got %v", out[0].SerialNumber)
|
||||||
}
|
}
|
||||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||||
}
|
}
|
||||||
if out[0].Status == nil || *out[0].Status != "WARNING" {
|
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||||
t.Fatalf("status: got %v", out[0].Status)
|
t.Fatalf("status: got %v", out[0].Status)
|
||||||
}
|
}
|
||||||
if out[0].Telemetry == nil {
|
if out[0].ECCUncorrectedTotal == nil || *out[0].ECCUncorrectedTotal != 2 {
|
||||||
t.Fatal("expected telemetry")
|
t.Fatalf("ecc_uncorrected_total: got %#v", out[0].ECCUncorrectedTotal)
|
||||||
}
|
}
|
||||||
if got, ok := out[0].Telemetry["ecc_uncorrected_total"].(int64); !ok || got != 2 {
|
if out[0].TemperatureC == nil || *out[0].TemperatureC != 55.5 {
|
||||||
t.Fatalf("ecc_uncorrected_total: got %#v", out[0].Telemetry["ecc_uncorrected_total"])
|
t.Fatalf("temperature_c: got %#v", out[0].TemperatureC)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -103,11 +103,11 @@ func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
out := enrichPCIeWithNVIDIAData(devices, nil, "BOARD-123", false)
|
out := enrichPCIeWithNVIDIAData(devices, nil, false)
|
||||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "BOARD-123-PCIE-0000:17:00.0" {
|
if out[0].SerialNumber != nil {
|
||||||
t.Fatalf("fallback serial: got %v", out[0].SerialNumber)
|
t.Fatalf("serial should stay nil without source data, got %v", out[0].SerialNumber)
|
||||||
}
|
}
|
||||||
if out[0].Status == nil || *out[0].Status != "UNKNOWN" {
|
if out[0].Status == nil || *out[0].Status != statusUnknown {
|
||||||
t.Fatalf("fallback status: got %v", out[0].Status)
|
t.Fatalf("fallback status: got %v", out[0].Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
|||||||
val := strings.TrimSpace(line[idx+2:])
|
val := strings.TrimSpace(line[idx+2:])
|
||||||
fields[key] = val
|
fields[key] = val
|
||||||
}
|
}
|
||||||
if !shouldIncludePCIeDevice(fields["Class"]) {
|
if !shouldIncludePCIeDevice(fields["Class"], fields["Vendor"], fields["Device"]) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
dev := parseLspciDevice(fields)
|
dev := parseLspciDevice(fields)
|
||||||
@@ -46,8 +46,10 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
|||||||
return devs
|
return devs
|
||||||
}
|
}
|
||||||
|
|
||||||
func shouldIncludePCIeDevice(class string) bool {
|
func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||||
c := strings.ToLower(strings.TrimSpace(class))
|
c := strings.ToLower(strings.TrimSpace(class))
|
||||||
|
v := strings.ToLower(strings.TrimSpace(vendor))
|
||||||
|
d := strings.ToLower(strings.TrimSpace(device))
|
||||||
if c == "" {
|
if c == "" {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
@@ -57,6 +59,8 @@ func shouldIncludePCIeDevice(class string) bool {
|
|||||||
"host bridge",
|
"host bridge",
|
||||||
"isa bridge",
|
"isa bridge",
|
||||||
"pci bridge",
|
"pci bridge",
|
||||||
|
"performance counter",
|
||||||
|
"performance counters",
|
||||||
"ram memory",
|
"ram memory",
|
||||||
"system peripheral",
|
"system peripheral",
|
||||||
"communication controller",
|
"communication controller",
|
||||||
@@ -66,12 +70,28 @@ func shouldIncludePCIeDevice(class string) bool {
|
|||||||
"audio device",
|
"audio device",
|
||||||
"serial bus controller",
|
"serial bus controller",
|
||||||
"unassigned class",
|
"unassigned class",
|
||||||
|
"non-essential instrumentation",
|
||||||
}
|
}
|
||||||
for _, bad := range excluded {
|
for _, bad := range excluded {
|
||||||
if strings.Contains(c, bad) {
|
if strings.Contains(c, bad) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||||
|
internalAMDPatterns := []string{
|
||||||
|
"dummy function",
|
||||||
|
"reserved spp",
|
||||||
|
"ptdma",
|
||||||
|
"cryptographic coprocessor pspcpp",
|
||||||
|
"pspcpp",
|
||||||
|
}
|
||||||
|
for _, bad := range internalAMDPatterns {
|
||||||
|
if strings.Contains(d, bad) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -79,11 +99,12 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
dev := schema.HardwarePCIeDevice{}
|
dev := schema.HardwarePCIeDevice{}
|
||||||
present := true
|
present := true
|
||||||
dev.Present = &present
|
dev.Present = &present
|
||||||
status := "OK"
|
status := statusOK
|
||||||
dev.Status = &status
|
dev.Status = &status
|
||||||
|
|
||||||
// Slot is the BDF: "0000:00:02.0"
|
// Slot is the BDF: "0000:00:02.0"
|
||||||
if bdf := fields["Slot"]; bdf != "" {
|
if bdf := fields["Slot"]; bdf != "" {
|
||||||
|
dev.Slot = &bdf
|
||||||
dev.BDF = &bdf
|
dev.BDF = &bdf
|
||||||
// parse vendor_id and device_id from sysfs
|
// parse vendor_id and device_id from sysfs
|
||||||
vendorID, deviceID := readPCIIDs(bdf)
|
vendorID, deviceID := readPCIIDs(bdf)
|
||||||
@@ -93,10 +114,34 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
if deviceID != 0 {
|
if deviceID != 0 {
|
||||||
dev.DeviceID = &deviceID
|
dev.DeviceID = &deviceID
|
||||||
}
|
}
|
||||||
|
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||||
|
dev.NUMANode = &numaNode
|
||||||
|
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||||
|
dev.NUMANode = &numaNode
|
||||||
|
}
|
||||||
|
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||||
|
dev.LinkWidth = &width
|
||||||
|
}
|
||||||
|
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
|
||||||
|
dev.MaxLinkWidth = &width
|
||||||
|
}
|
||||||
|
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
|
||||||
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
|
if linkSpeed != "" {
|
||||||
|
dev.LinkSpeed = &linkSpeed
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
|
||||||
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
|
if linkSpeed != "" {
|
||||||
|
dev.MaxLinkSpeed = &linkSpeed
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if v := fields["Class"]; v != "" {
|
if v := fields["Class"]; v != "" {
|
||||||
dev.DeviceClass = &v
|
class := mapPCIeDeviceClass(v)
|
||||||
|
dev.DeviceClass = &class
|
||||||
}
|
}
|
||||||
if v := fields["Vendor"]; v != "" {
|
if v := fields["Vendor"]; v != "" {
|
||||||
dev.Manufacturer = &v
|
dev.Manufacturer = &v
|
||||||
@@ -131,3 +176,67 @@ func readHexFile(path string) (int, error) {
|
|||||||
n, err := strconv.ParseInt(s, 16, 64)
|
n, err := strconv.ParseInt(s, 16, 64)
|
||||||
return int(n), err
|
return int(n), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func readPCINumaNode(bdf string) (int, bool) {
|
||||||
|
value, ok := readPCIIntAttribute(bdf, "numa_node")
|
||||||
|
if !ok || value < 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func parsePCINumaNode(raw string) (int, bool) {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
value, err := strconv.Atoi(raw)
|
||||||
|
if err != nil || value < 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
|
||||||
|
out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output()
|
||||||
|
if err != nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
value, err := strconv.Atoi(strings.TrimSpace(string(out)))
|
||||||
|
if err != nil || value < 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||||
|
out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output()
|
||||||
|
if err != nil {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
value := strings.TrimSpace(string(out))
|
||||||
|
if value == "" {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizePCILinkSpeed(raw string) string {
|
||||||
|
raw = strings.TrimSpace(strings.ToLower(raw))
|
||||||
|
switch {
|
||||||
|
case strings.Contains(raw, "2.5"):
|
||||||
|
return "Gen1"
|
||||||
|
case strings.Contains(raw, "5.0"):
|
||||||
|
return "Gen2"
|
||||||
|
case strings.Contains(raw, "8.0"):
|
||||||
|
return "Gen3"
|
||||||
|
case strings.Contains(raw, "16.0"):
|
||||||
|
return "Gen4"
|
||||||
|
case strings.Contains(raw, "32.0"):
|
||||||
|
return "Gen5"
|
||||||
|
case strings.Contains(raw, "64.0"):
|
||||||
|
return "Gen6"
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,41 +1,126 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestShouldIncludePCIeDevice(t *testing.T) {
|
func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
class string
|
name string
|
||||||
want bool
|
class string
|
||||||
|
vendor string
|
||||||
|
device string
|
||||||
|
want bool
|
||||||
}{
|
}{
|
||||||
{"USB controller", false},
|
{name: "usb", class: "USB controller", want: false},
|
||||||
{"System peripheral", false},
|
{name: "system peripheral", class: "System peripheral", want: false},
|
||||||
{"Audio device", false},
|
{name: "audio", class: "Audio device", want: false},
|
||||||
{"Host bridge", false},
|
{name: "host bridge", class: "Host bridge", want: false},
|
||||||
{"PCI bridge", false},
|
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||||
{"SMBus", false},
|
{name: "smbus", class: "SMBus", want: false},
|
||||||
{"Ethernet controller", true},
|
{name: "perf", class: "Performance counters", want: false},
|
||||||
{"RAID bus controller", true},
|
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||||
{"Non-Volatile memory controller", true},
|
{name: "amd dummy function", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse PTDMA", want: false},
|
||||||
{"VGA compatible controller", true},
|
{name: "amd pspcpp", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse Cryptographic Coprocessor PSPCPP", want: false},
|
||||||
|
{name: "ethernet", class: "Ethernet controller", want: true},
|
||||||
|
{name: "raid", class: "RAID bus controller", want: true},
|
||||||
|
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||||
|
{name: "vga", class: "VGA compatible controller", want: true},
|
||||||
|
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
got := shouldIncludePCIeDevice(tt.class)
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
if got != tt.want {
|
got := shouldIncludePCIeDevice(tt.class, tt.vendor, tt.device)
|
||||||
t.Fatalf("class %q include=%v want %v", tt.class, got, tt.want)
|
if got != tt.want {
|
||||||
}
|
t.Fatalf("class=%q vendor=%q device=%q include=%v want %v", tt.class, tt.vendor, tt.device, got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||||
input := "Slot:\t0000:00:14.0\nClass:\tUSB controller\nVendor:\tIntel Corporation\nDevice:\tUSB 3.0\n\n" +
|
input := "Slot:\t0000:00:14.0\nClass:\tUSB controller\nVendor:\tIntel Corporation\nDevice:\tUSB 3.0\n\n" +
|
||||||
|
"Slot:\t0000:00:18.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||||
|
|
||||||
devs := parseLspci(input)
|
devs := parseLspci(input)
|
||||||
if len(devs) != 1 {
|
if len(devs) != 1 {
|
||||||
t.Fatalf("expected 1 filtered device, got %d", len(devs))
|
t.Fatalf("expected 1 filtered device, got %d", len(devs))
|
||||||
}
|
}
|
||||||
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VGA compatible controller" {
|
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VideoController" {
|
||||||
t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass)
|
t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass)
|
||||||
}
|
}
|
||||||
|
if devs[0].Slot == nil || *devs[0].Slot != "0000:65:00.0" {
|
||||||
|
t.Fatalf("slot: got %v", devs[0].Slot)
|
||||||
|
}
|
||||||
|
if devs[0].BDF == nil || *devs[0].BDF != "0000:65:00.0" {
|
||||||
|
t.Fatalf("bdf: got %v", devs[0].BDF)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||||
|
input := "" +
|
||||||
|
"Slot:\t0000:1a:00.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||||
|
"Slot:\t0000:1a:00.2\nClass:\tEncryption controller\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PTDMA\n\n" +
|
||||||
|
"Slot:\t0000:05:00.0\nClass:\tEthernet controller\nVendor:\tMellanox Technologies\nDevice:\tMT28908 Family [ConnectX-6]\n\n"
|
||||||
|
|
||||||
|
devs := parseLspci(input)
|
||||||
|
if len(devs) != 1 {
|
||||||
|
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||||
|
}
|
||||||
|
if devs[0].Model == nil || *devs[0].Model != "MT28908 Family [ConnectX-6]" {
|
||||||
|
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||||
|
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||||
|
|
||||||
|
devs := parseLspci(input)
|
||||||
|
data, err := json.Marshal(devs[0])
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("marshal: %v", err)
|
||||||
|
}
|
||||||
|
text := string(data)
|
||||||
|
if !strings.Contains(text, `"slot":"0000:65:00.0"`) {
|
||||||
|
t.Fatalf("json missing slot: %s", text)
|
||||||
|
}
|
||||||
|
if strings.Contains(text, `"bdf"`) {
|
||||||
|
t.Fatalf("json should not emit bdf: %s", text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseLspciUsesNUMANodeFieldWhenSysfsUnavailable(t *testing.T) {
|
||||||
|
input := "Slot:\t0000:65:00.0\nClass:\tEthernet controller\nVendor:\tIntel Corporation\nDevice:\tX710\nNUMANode:\t1\n\n"
|
||||||
|
|
||||||
|
devs := parseLspci(input)
|
||||||
|
if len(devs) != 1 {
|
||||||
|
t.Fatalf("expected 1 device, got %d", len(devs))
|
||||||
|
}
|
||||||
|
if devs[0].NUMANode == nil || *devs[0].NUMANode != 1 {
|
||||||
|
t.Fatalf("numa_node=%v want 1", devs[0].NUMANode)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
raw string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{"2.5 GT/s PCIe", "Gen1"},
|
||||||
|
{"5.0 GT/s PCIe", "Gen2"},
|
||||||
|
{"8.0 GT/s PCIe", "Gen3"},
|
||||||
|
{"16.0 GT/s PCIe", "Gen4"},
|
||||||
|
{"32.0 GT/s PCIe", "Gen5"},
|
||||||
|
{"64.0 GT/s PCIe", "Gen6"},
|
||||||
|
{"unknown", ""},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
if got := normalizePCILinkSpeed(tt.raw); got != tt.want {
|
||||||
|
t.Fatalf("normalizePCILinkSpeed(%q)=%q want %q", tt.raw, got, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
123
audit/internal/collector/pcie_identity.go
Normal file
123
audit/internal/collector/pcie_identity.go
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||||
|
out, err := exec.Command("lspci", "-vv", "-s", bdf).Output()
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return string(out), nil
|
||||||
|
}
|
||||||
|
readPCIVPDFile = func(bdf string) ([]byte, error) {
|
||||||
|
return os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, "vpd"))
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
func enrichPCIeWithPCISerials(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
|
enriched := 0
|
||||||
|
for i := range devs {
|
||||||
|
if !shouldProbePCIeSerial(devs[i]) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
bdf := normalizePCIeBDF(*devs[i].BDF)
|
||||||
|
if bdf == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||||
|
devs[i].SerialNumber = &serial
|
||||||
|
enriched++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if enriched > 0 {
|
||||||
|
slog.Info("pcie: serials enriched", "count", enriched)
|
||||||
|
}
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
func shouldProbePCIeSerial(dev schema.HardwarePCIeDevice) bool {
|
||||||
|
if dev.BDF == nil || dev.SerialNumber != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if dev.DeviceClass == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
class := strings.TrimSpace(*dev.DeviceClass)
|
||||||
|
return isNICClass(class) || isGPUClass(class)
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryPCIDeviceSerial(bdf string) string {
|
||||||
|
if out, err := queryPCILSPCIDetail(bdf); err == nil {
|
||||||
|
if serial := parseLSPCIDetailSerial(out); serial != "" {
|
||||||
|
return serial
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if raw, err := readPCIVPDFile(bdf); err == nil {
|
||||||
|
return parsePCIVPDSerial(raw)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseLSPCIDetailSerial extracts the value of a "Serial number:" line from
// lspci -vv output (case-insensitive match). Returns "" when no such line
// carries a non-empty value.
//
// Bug fix: the previous version located the FIRST colon on the line, so a
// line such as "Capabilities: [100 v1] Serial number: X" returned everything
// after "Capabilities:" instead of the serial. We now slice relative to the
// "serial number:" marker itself.
func parseLSPCIDetailSerial(raw string) string {
	const marker = "serial number:"
	for _, line := range strings.Split(raw, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		// Find the marker case-insensitively, then slice the original line
		// so the serial's own casing is preserved.
		idx := strings.Index(strings.ToLower(line), marker)
		if idx < 0 {
			continue
		}
		if serial := strings.TrimSpace(line[idx+len(marker):]); serial != "" {
			return serial
		}
	}
	return ""
}
|
||||||
|
|
||||||
|
func parsePCIVPDSerial(raw []byte) string {
|
||||||
|
for i := 0; i+3 < len(raw); i++ {
|
||||||
|
if raw[i] != 'S' || raw[i+1] != 'N' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
length := int(raw[i+2])
|
||||||
|
if length <= 0 || length > 64 || i+3+length > len(raw) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value := strings.TrimSpace(strings.Trim(string(raw[i+3:i+3+length]), "\x00"))
|
||||||
|
if !looksLikeSerial(value) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// looksLikeSerial reports whether value is plausibly a serial number: at
// least 4 characters, only alphanumerics plus a small punctuation set
// (" -_./:"), and at least one alphanumeric character.
func looksLikeSerial(value string) bool {
	if len(value) < 4 {
		return false
	}
	seenAlnum := false
	for _, r := range value {
		isAlnum := (r >= 'a' && r <= 'z') ||
			(r >= 'A' && r <= 'Z') ||
			(r >= '0' && r <= '9')
		if isAlnum {
			seenAlnum = true
			continue
		}
		if !strings.ContainsRune(" -_./:", r) {
			return false
		}
	}
	return seenAlnum
}
|
||||||
47
audit/internal/collector/pcie_identity_test.go
Normal file
47
audit/internal/collector/pcie_identity_test.go
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEnrichPCIeWithPCISerialsAddsGPUFallback(t *testing.T) {
|
||||||
|
origDetail := queryPCILSPCIDetail
|
||||||
|
origVPD := readPCIVPDFile
|
||||||
|
t.Cleanup(func() {
|
||||||
|
queryPCILSPCIDetail = origDetail
|
||||||
|
readPCIVPDFile = origVPD
|
||||||
|
})
|
||||||
|
|
||||||
|
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||||
|
if bdf != "0000:11:00.0" {
|
||||||
|
t.Fatalf("unexpected bdf: %s", bdf)
|
||||||
|
}
|
||||||
|
return "Serial number: GPU-SN-12345\n", nil
|
||||||
|
}
|
||||||
|
readPCIVPDFile = func(string) ([]byte, error) {
|
||||||
|
return nil, fmt.Errorf("no vpd needed")
|
||||||
|
}
|
||||||
|
|
||||||
|
class := "DisplayController"
|
||||||
|
bdf := "0000:11:00.0"
|
||||||
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
|
DeviceClass: &class,
|
||||||
|
BDF: &bdf,
|
||||||
|
}}
|
||||||
|
|
||||||
|
out := enrichPCIeWithPCISerials(devs)
|
||||||
|
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-SN-12345" {
|
||||||
|
t.Fatalf("serial=%v want GPU-SN-12345", out[0].SerialNumber)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestShouldProbePCIeSerialSkipsNonGPUOrNIC(t *testing.T) {
|
||||||
|
class := "StorageController"
|
||||||
|
bdf := "0000:19:00.0"
|
||||||
|
dev := schema.HardwarePCIeDevice{DeviceClass: &class, BDF: &bdf}
|
||||||
|
if shouldProbePCIeSerial(dev) {
|
||||||
|
t.Fatal("unexpected probe for storage controller")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -4,18 +4,32 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"regexp"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
func collectPSUs() []schema.HardwarePowerSupply {
|
func collectPSUs() []schema.HardwarePowerSupply {
|
||||||
// ipmitool requires /dev/ipmi0 — not available on non-server hardware
|
var psus []schema.HardwarePowerSupply
|
||||||
out, err := exec.Command("ipmitool", "fru", "print").Output()
|
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
||||||
if err != nil {
|
psus = parseFRU(string(out))
|
||||||
|
} else {
|
||||||
|
slog.Info("psu: fru unavailable", "err", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
sdrData := map[int]psuSDR{}
|
||||||
|
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||||
|
sdrData = parsePSUSDR(string(sdrOut))
|
||||||
|
if len(psus) == 0 {
|
||||||
|
psus = synthesizePSUsFromSDR(sdrData)
|
||||||
|
} else {
|
||||||
|
mergePSUSDR(psus, sdrData)
|
||||||
|
}
|
||||||
|
} else if len(psus) == 0 {
|
||||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
psus := parseFRU(string(out))
|
|
||||||
slog.Info("psu: collected", "count", len(psus))
|
slog.Info("psu: collected", "count", len(psus))
|
||||||
return psus
|
return psus
|
||||||
}
|
}
|
||||||
@@ -75,9 +89,7 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
|||||||
|
|
||||||
// Only process PSU FRU records
|
// Only process PSU FRU records
|
||||||
headerLower := strings.ToLower(header)
|
headerLower := strings.ToLower(header)
|
||||||
if !strings.Contains(headerLower, "psu") &&
|
if !isPSUHeader(headerLower) {
|
||||||
!strings.Contains(headerLower, "power supply") &&
|
|
||||||
!strings.Contains(headerLower, "power_supply") {
|
|
||||||
return schema.HardwarePowerSupply{}, false
|
return schema.HardwarePowerSupply{}, false
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -85,21 +97,24 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
|||||||
psu := schema.HardwarePowerSupply{Present: &present}
|
psu := schema.HardwarePowerSupply{Present: &present}
|
||||||
|
|
||||||
slotStr := strconv.Itoa(slotIdx)
|
slotStr := strconv.Itoa(slotIdx)
|
||||||
|
if slot, ok := parsePSUSlot(header); ok && slot > 0 {
|
||||||
|
slotStr = strconv.Itoa(slot - 1)
|
||||||
|
}
|
||||||
psu.Slot = &slotStr
|
psu.Slot = &slotStr
|
||||||
|
|
||||||
if v := cleanDMIValue(fields["Board Product"]); v != "" {
|
if v := firstNonEmptyField(fields, "Board Product", "Product Name", "Product Part Number"); v != "" {
|
||||||
psu.Model = &v
|
psu.Model = &v
|
||||||
}
|
}
|
||||||
if v := cleanDMIValue(fields["Board Mfg"]); v != "" {
|
if v := firstNonEmptyField(fields, "Board Mfg", "Product Manufacturer", "Product Manufacturer Name"); v != "" {
|
||||||
psu.Vendor = &v
|
psu.Vendor = &v
|
||||||
}
|
}
|
||||||
if v := cleanDMIValue(fields["Board Serial"]); v != "" {
|
if v := firstNonEmptyField(fields, "Board Serial", "Product Serial", "Product Serial Number"); v != "" {
|
||||||
psu.SerialNumber = &v
|
psu.SerialNumber = &v
|
||||||
}
|
}
|
||||||
if v := cleanDMIValue(fields["Board Part Number"]); v != "" {
|
if v := firstNonEmptyField(fields, "Board Part Number", "Product Part Number", "Part Number"); v != "" {
|
||||||
psu.PartNumber = &v
|
psu.PartNumber = &v
|
||||||
}
|
}
|
||||||
if v := cleanDMIValue(fields["Board Extra"]); v != "" {
|
if v := firstNonEmptyField(fields, "Board Extra", "Product Version", "Board Version"); v != "" {
|
||||||
psu.Firmware = &v
|
psu.Firmware = &v
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -110,12 +125,230 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
status := "OK"
|
status := statusOK
|
||||||
psu.Status = &status
|
psu.Status = &status
|
||||||
|
|
||||||
return psu, true
|
return psu, true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isPSUHeader reports whether an already-lowercased FRU header names a power
// supply, covering common vendor spellings (psu, pws, power supply, …).
func isPSUHeader(headerLower string) bool {
	markers := []string{"psu", "pws", "power supply", "power_supply", "power module"}
	for _, marker := range markers {
		if strings.Contains(headerLower, marker) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
func firstNonEmptyField(fields map[string]string, keys ...string) string {
|
||||||
|
for _, key := range keys {
|
||||||
|
if value := cleanDMIValue(fields[key]); value != "" {
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuSDR aggregates the ipmitool `sdr` sensor readings observed for one PSU
// slot. Pointer fields are nil when the corresponding sensor was absent.
type psuSDR struct {
	slot         int      // 1-based slot number parsed from the sensor name
	status       string   // statusOK unless a sensor reported a non-OK state
	reason       string   // human-readable explanation for a non-OK status
	inputPowerW  *float64 // AC input power, watts
	outputPowerW *float64 // DC output power, watts
	inputVoltage *float64 // AC input voltage, volts
	temperatureC *float64 // PSU temperature, Celsius
	healthPct    *float64 // remaining-life / health percentage
}
|
||||||
|
|
||||||
|
// psuSlotPatterns lists, in priority order, the regexes used to pull a PSU
// slot number out of a sensor/FRU name. Earlier patterns win (see
// parsePSUSlot), so the more specific vendor prefixes come first.
var psuSlotPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),                        // "PSU1", "PS 2"
	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),                          // "PS1" variants
	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),                         // Supermicro "PWS1"
	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),   // "Power Supply Bay 8"
	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),                         // bare "Bay 3" fallback
}
|
||||||
|
|
||||||
|
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||||
|
out := map[int]psuSDR{}
|
||||||
|
for _, line := range strings.Split(raw, "\n") {
|
||||||
|
fields := splitSDRFields(line)
|
||||||
|
if len(fields) < 3 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := fields[0]
|
||||||
|
value := fields[1]
|
||||||
|
state := strings.ToLower(fields[2])
|
||||||
|
slot, ok := parsePSUSlot(name)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
entry := out[slot]
|
||||||
|
entry.slot = slot
|
||||||
|
if entry.status == "" {
|
||||||
|
entry.status = statusOK
|
||||||
|
}
|
||||||
|
if state != "" && state != "ok" && state != "ns" {
|
||||||
|
entry.status = statusCritical
|
||||||
|
entry.reason = "PSU sensor reported non-OK state: " + state
|
||||||
|
}
|
||||||
|
|
||||||
|
lowerName := strings.ToLower(name)
|
||||||
|
switch {
|
||||||
|
case strings.Contains(lowerName, "input power"):
|
||||||
|
entry.inputPowerW = parseFloatPtr(value)
|
||||||
|
case strings.Contains(lowerName, "output power"):
|
||||||
|
entry.outputPowerW = parseFloatPtr(value)
|
||||||
|
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
||||||
|
entry.outputPowerW = parseFloatPtr(value)
|
||||||
|
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||||
|
entry.inputVoltage = parseFloatPtr(value)
|
||||||
|
case strings.Contains(lowerName, "temp"):
|
||||||
|
entry.temperatureC = parseFloatPtr(value)
|
||||||
|
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||||
|
entry.healthPct = parsePercentPtr(value)
|
||||||
|
}
|
||||||
|
out[slot] = entry
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||||
|
if len(sdr) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(sdr))
|
||||||
|
for slot := range sdr {
|
||||||
|
slots = append(slots, slot)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
|
||||||
|
out := make([]schema.HardwarePowerSupply, 0, len(slots))
|
||||||
|
for _, slot := range slots {
|
||||||
|
entry := sdr[slot]
|
||||||
|
present := true
|
||||||
|
status := entry.status
|
||||||
|
if status == "" {
|
||||||
|
status = statusUnknown
|
||||||
|
}
|
||||||
|
slotStr := strconv.Itoa(slot - 1)
|
||||||
|
model := "PSU"
|
||||||
|
psu := schema.HardwarePowerSupply{
|
||||||
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
|
Slot: &slotStr,
|
||||||
|
Present: &present,
|
||||||
|
Model: &model,
|
||||||
|
InputPowerW: entry.inputPowerW,
|
||||||
|
OutputPowerW: entry.outputPowerW,
|
||||||
|
InputVoltage: entry.inputVoltage,
|
||||||
|
TemperatureC: entry.temperatureC,
|
||||||
|
}
|
||||||
|
if entry.healthPct != nil {
|
||||||
|
psu.LifeRemainingPct = entry.healthPct
|
||||||
|
lifeUsed := 100 - *entry.healthPct
|
||||||
|
psu.LifeUsedPct = &lifeUsed
|
||||||
|
}
|
||||||
|
if entry.reason != "" {
|
||||||
|
psu.ErrorDescription = &entry.reason
|
||||||
|
}
|
||||||
|
out = append(out, psu)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
|
||||||
|
for i := range psus {
|
||||||
|
slotIdx, err := strconv.Atoi(derefPSUSlot(psus[i].Slot))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
entry, ok := sdr[slotIdx+1]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if entry.inputPowerW != nil {
|
||||||
|
psus[i].InputPowerW = entry.inputPowerW
|
||||||
|
}
|
||||||
|
if entry.outputPowerW != nil {
|
||||||
|
psus[i].OutputPowerW = entry.outputPowerW
|
||||||
|
}
|
||||||
|
if entry.inputVoltage != nil {
|
||||||
|
psus[i].InputVoltage = entry.inputVoltage
|
||||||
|
}
|
||||||
|
if entry.temperatureC != nil {
|
||||||
|
psus[i].TemperatureC = entry.temperatureC
|
||||||
|
}
|
||||||
|
if entry.healthPct != nil {
|
||||||
|
psus[i].LifeRemainingPct = entry.healthPct
|
||||||
|
lifeUsed := 100 - *entry.healthPct
|
||||||
|
psus[i].LifeUsedPct = &lifeUsed
|
||||||
|
}
|
||||||
|
if entry.status != "" {
|
||||||
|
psus[i].Status = &entry.status
|
||||||
|
}
|
||||||
|
if entry.reason != "" {
|
||||||
|
psus[i].ErrorDescription = &entry.reason
|
||||||
|
}
|
||||||
|
if psus[i].Status != nil && *psus[i].Status == statusOK {
|
||||||
|
if (entry.inputPowerW == nil && entry.outputPowerW == nil && entry.inputVoltage == nil) && entry.status == "" {
|
||||||
|
unknown := statusUnknown
|
||||||
|
psus[i].Status = &unknown
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// splitSDRFields splits an ipmitool sdr line on '|', trimming whitespace and
// dropping empty cells.
func splitSDRFields(line string) []string {
	pieces := strings.Split(line, "|")
	fields := make([]string, 0, len(pieces))
	for _, piece := range pieces {
		if trimmed := strings.TrimSpace(piece); trimmed != "" {
			fields = append(fields, trimmed)
		}
	}
	return fields
}
|
||||||
|
|
||||||
|
func parsePSUSlot(name string) (int, bool) {
|
||||||
|
for _, re := range psuSlotPatterns {
|
||||||
|
m := re.FindStringSubmatch(strings.ToLower(name))
|
||||||
|
if len(m) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, group := range m[1:] {
|
||||||
|
if group == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
n, err := strconv.Atoi(group)
|
||||||
|
if err == nil && n > 0 {
|
||||||
|
return n, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseFloatPtr pulls the first parseable float out of a whitespace-separated
// reading like "215 Watts". Returns nil for empty, "na", or all-non-numeric
// input.
func parseFloatPtr(raw string) *float64 {
	raw = strings.TrimSpace(raw)
	if raw == "" || strings.EqualFold(raw, "na") {
		return nil
	}
	for _, token := range strings.Fields(raw) {
		if v, err := strconv.ParseFloat(token, 64); err == nil {
			return &v
		}
	}
	return nil
}
|
||||||
|
|
||||||
|
// derefPSUSlot safely dereferences an optional slot string, mapping nil to "".
func derefPSUSlot(slot *string) string {
	if slot != nil {
		return *slot
	}
	return ""
}
|
||||||
|
|
||||||
// parseWattage extracts wattage from strings like "PSU 800W", "1200W PLATINUM".
|
// parseWattage extracts wattage from strings like "PSU 800W", "1200W PLATINUM".
|
||||||
func parseWattage(s string) int {
|
func parseWattage(s string) int {
|
||||||
s = strings.ToUpper(s)
|
s = strings.ToUpper(s)
|
||||||
|
|||||||
91
audit/internal/collector/psu_sdr_test.go
Normal file
91
audit/internal/collector/psu_sdr_test.go
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestParsePSUSDR(t *testing.T) {
|
||||||
|
raw := `
|
||||||
|
PS1 Input Power | 215 Watts | ok
|
||||||
|
PS1 Output Power | 198 Watts | ok
|
||||||
|
PS1 Input Voltage | 229 Volts | ok
|
||||||
|
PS1 Temp | 39 C | ok
|
||||||
|
PS1 Health | 97 % | ok
|
||||||
|
PS2 Input Power | 0 Watts | cr
|
||||||
|
`
|
||||||
|
|
||||||
|
got := parsePSUSDR(raw)
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("len(got)=%d want 2", len(got))
|
||||||
|
}
|
||||||
|
if got[1].status != statusOK {
|
||||||
|
t.Fatalf("ps1 status=%q", got[1].status)
|
||||||
|
}
|
||||||
|
if got[1].inputPowerW == nil || *got[1].inputPowerW != 215 {
|
||||||
|
t.Fatalf("ps1 input power=%v", got[1].inputPowerW)
|
||||||
|
}
|
||||||
|
if got[1].outputPowerW == nil || *got[1].outputPowerW != 198 {
|
||||||
|
t.Fatalf("ps1 output power=%v", got[1].outputPowerW)
|
||||||
|
}
|
||||||
|
if got[1].inputVoltage == nil || *got[1].inputVoltage != 229 {
|
||||||
|
t.Fatalf("ps1 input voltage=%v", got[1].inputVoltage)
|
||||||
|
}
|
||||||
|
if got[1].temperatureC == nil || *got[1].temperatureC != 39 {
|
||||||
|
t.Fatalf("ps1 temperature=%v", got[1].temperatureC)
|
||||||
|
}
|
||||||
|
if got[1].healthPct == nil || *got[1].healthPct != 97 {
|
||||||
|
t.Fatalf("ps1 health=%v", got[1].healthPct)
|
||||||
|
}
|
||||||
|
if got[2].status != statusCritical {
|
||||||
|
t.Fatalf("ps2 status=%q", got[2].status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{name: "PWS1 Status", want: 1},
|
||||||
|
{name: "Power Supply Bay 8", want: 8},
|
||||||
|
{name: "PS 6 Input Power", want: 6},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
got, ok := parsePSUSlot(tt.name)
|
||||||
|
if !ok || got != tt.want {
|
||||||
|
t.Fatalf("parsePSUSlot(%q)=(%d,%v) want (%d,true)", tt.name, got, ok, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
health := 97.0
|
||||||
|
outputPower := 915.0
|
||||||
|
got := synthesizePSUsFromSDR(map[int]psuSDR{
|
||||||
|
1: {
|
||||||
|
slot: 1,
|
||||||
|
status: statusOK,
|
||||||
|
outputPowerW: &outputPower,
|
||||||
|
healthPct: &health,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(got) != 1 {
|
||||||
|
t.Fatalf("len(got)=%d want 1", len(got))
|
||||||
|
}
|
||||||
|
if got[0].Slot == nil || *got[0].Slot != "0" {
|
||||||
|
t.Fatalf("slot=%v want 0", got[0].Slot)
|
||||||
|
}
|
||||||
|
if got[0].OutputPowerW == nil || *got[0].OutputPowerW != 915 {
|
||||||
|
t.Fatalf("output power=%v", got[0].OutputPowerW)
|
||||||
|
}
|
||||||
|
if got[0].LifeRemainingPct == nil || *got[0].LifeRemainingPct != 97 {
|
||||||
|
t.Fatalf("life remaining=%v", got[0].LifeRemainingPct)
|
||||||
|
}
|
||||||
|
if got[0].LifeUsedPct == nil || *got[0].LifeUsedPct != 3 {
|
||||||
|
t.Fatalf("life used=%v", got[0].LifeUsedPct)
|
||||||
|
}
|
||||||
|
}
|
||||||
121
audit/internal/collector/psu_telemetry.go
Normal file
121
audit/internal/collector/psu_telemetry.go
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func enrichPSUsWithTelemetry(psus []schema.HardwarePowerSupply, doc sensorsDoc) []schema.HardwarePowerSupply {
|
||||||
|
if len(psus) == 0 || len(doc) == 0 {
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|
||||||
|
tempBySlot := psuTempsFromSensors(doc)
|
||||||
|
healthBySlot := psuHealthFromSensors(doc)
|
||||||
|
for i := range psus {
|
||||||
|
slot := derefPSUSlot(psus[i].Slot)
|
||||||
|
if slot == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if psus[i].TemperatureC == nil {
|
||||||
|
if value, ok := tempBySlot[slot]; ok {
|
||||||
|
psus[i].TemperatureC = &value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if psus[i].LifeRemainingPct == nil {
|
||||||
|
if value, ok := healthBySlot[slot]; ok {
|
||||||
|
psus[i].LifeRemainingPct = &value
|
||||||
|
used := 100 - value
|
||||||
|
psus[i].LifeUsedPct = &used
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|
||||||
|
func psuHealthFromSensors(doc sensorsDoc) map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
for chip, features := range doc {
|
||||||
|
for featureName, raw := range features {
|
||||||
|
feature, ok := raw.(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isLikelyPSUHealth(chip, featureName) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, ok := firstFeaturePercent(feature)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if slot, ok := detectPSUSlot(chip, featureName); ok {
|
||||||
|
if _, exists := out[slot]; !exists {
|
||||||
|
out[slot] = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeaturePercent(feature map[string]any) (float64, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
lower := strings.ToLower(key)
|
||||||
|
if strings.HasSuffix(lower, "_alarm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.Contains(lower, "health") || strings.Contains(lower, "life") || strings.Contains(lower, "remain") {
|
||||||
|
if value, ok := floatFromAny(feature[key]); ok {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// isLikelyPSUHealth reports whether chip+feature names a PSU health metric:
// it must mention a PSU and a health/life/remaining keyword.
func isLikelyPSUHealth(chip, feature string) bool {
	combined := strings.ToLower(chip + " " + feature)
	mentionsPSU := strings.Contains(combined, "psu") || strings.Contains(combined, "power supply")
	mentionsHealth := strings.Contains(combined, "health") ||
		strings.Contains(combined, "life") ||
		strings.Contains(combined, "remain")
	return mentionsPSU && mentionsHealth
}
|
||||||
|
|
||||||
|
func psuTempsFromSensors(doc sensorsDoc) map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
for chip, features := range doc {
|
||||||
|
for featureName, raw := range features {
|
||||||
|
feature, ok := raw.(map[string]any)
|
||||||
|
if !ok || classifySensorFeature(feature) != "temp" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isLikelyPSUTemp(chip, featureName) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
temp, ok := firstFeatureFloat(feature, "_input")
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if slot, ok := detectPSUSlot(chip, featureName); ok {
|
||||||
|
if _, exists := out[slot]; !exists {
|
||||||
|
out[slot] = temp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// isLikelyPSUTemp reports whether chip+feature mentions a power supply at
// all; the caller has already established the feature is a temperature.
func isLikelyPSUTemp(chip, feature string) bool {
	combined := strings.ToLower(chip + " " + feature)
	return strings.Contains(combined, "psu") || strings.Contains(combined, "power supply")
}
|
||||||
|
|
||||||
|
func detectPSUSlot(parts ...string) (string, bool) {
|
||||||
|
for _, part := range parts {
|
||||||
|
if value, ok := parsePSUSlot(part); ok && value > 0 {
|
||||||
|
return strconv.Itoa(value - 1), true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
42
audit/internal/collector/psu_telemetry_test.go
Normal file
42
audit/internal/collector/psu_telemetry_test.go
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEnrichPSUsWithTelemetry(t *testing.T) {
|
||||||
|
slot0 := "0"
|
||||||
|
slot1 := "1"
|
||||||
|
psus := []schema.HardwarePowerSupply{
|
||||||
|
{Slot: &slot0},
|
||||||
|
{Slot: &slot1},
|
||||||
|
}
|
||||||
|
|
||||||
|
doc := sensorsDoc{
|
||||||
|
"psu-hwmon-0": {
|
||||||
|
"PSU1 Temp": map[string]any{"temp1_input": 39.5},
|
||||||
|
"PSU2 Temp": map[string]any{"temp2_input": 41.0},
|
||||||
|
"PSU1 Health": map[string]any{"health1_input": 98.0},
|
||||||
|
"PSU2 Remaining Life": map[string]any{"life2_input": 95.0},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got := enrichPSUsWithTelemetry(psus, doc)
|
||||||
|
if got[0].TemperatureC == nil || *got[0].TemperatureC != 39.5 {
|
||||||
|
t.Fatalf("psu0 temperature mismatch: %#v", got[0].TemperatureC)
|
||||||
|
}
|
||||||
|
if got[1].TemperatureC == nil || *got[1].TemperatureC != 41.0 {
|
||||||
|
t.Fatalf("psu1 temperature mismatch: %#v", got[1].TemperatureC)
|
||||||
|
}
|
||||||
|
if got[0].LifeRemainingPct == nil || *got[0].LifeRemainingPct != 98.0 {
|
||||||
|
t.Fatalf("psu0 life remaining mismatch: %#v", got[0].LifeRemainingPct)
|
||||||
|
}
|
||||||
|
if got[0].LifeUsedPct == nil || *got[0].LifeUsedPct != 2.0 {
|
||||||
|
t.Fatalf("psu0 life used mismatch: %#v", got[0].LifeUsedPct)
|
||||||
|
}
|
||||||
|
if got[1].LifeRemainingPct == nil || *got[1].LifeRemainingPct != 95.0 {
|
||||||
|
t.Fatalf("psu1 life remaining mismatch: %#v", got[1].LifeRemainingPct)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -83,11 +83,7 @@ func isLikelyRAIDController(dev schema.HardwarePCIeDevice) bool {
|
|||||||
if dev.DeviceClass == nil {
|
if dev.DeviceClass == nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
c := strings.ToLower(*dev.DeviceClass)
|
return isRAIDClass(*dev.DeviceClass)
|
||||||
return strings.Contains(c, "raid") ||
|
|
||||||
strings.Contains(c, "sas") ||
|
|
||||||
strings.Contains(c, "mass storage") ||
|
|
||||||
strings.Contains(c, "serial attached scsi")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func collectStorcliDrives() []schema.HardwareStorage {
|
func collectStorcliDrives() []schema.HardwareStorage {
|
||||||
@@ -182,7 +178,10 @@ func parseSASIrcuDisplay(raw string) []schema.HardwareStorage {
|
|||||||
|
|
||||||
present := true
|
present := true
|
||||||
status := mapRAIDDriveStatus(b["State"])
|
status := mapRAIDDriveStatus(b["State"])
|
||||||
s := schema.HardwareStorage{Present: &present, Status: &status}
|
s := schema.HardwareStorage{
|
||||||
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
|
Present: &present,
|
||||||
|
}
|
||||||
|
|
||||||
enclosure := strings.TrimSpace(b["Enclosure #"])
|
enclosure := strings.TrimSpace(b["Enclosure #"])
|
||||||
slot := strings.TrimSpace(b["Slot #"])
|
slot := strings.TrimSpace(b["Slot #"])
|
||||||
@@ -281,7 +280,10 @@ func parseArcconfPhysicalDrives(raw string) []schema.HardwareStorage {
|
|||||||
for _, b := range blocks {
|
for _, b := range blocks {
|
||||||
present := true
|
present := true
|
||||||
status := mapRAIDDriveStatus(b["State"])
|
status := mapRAIDDriveStatus(b["State"])
|
||||||
s := schema.HardwareStorage{Present: &present, Status: &status}
|
s := schema.HardwareStorage{
|
||||||
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
|
Present: &present,
|
||||||
|
}
|
||||||
|
|
||||||
if v := strings.TrimSpace(b["Reported Location"]); v != "" {
|
if v := strings.TrimSpace(b["Reported Location"]); v != "" {
|
||||||
s.Slot = &v
|
s.Slot = &v
|
||||||
@@ -362,8 +364,11 @@ func parseSSACLIPhysicalDrives(raw string) []schema.HardwareStorage {
|
|||||||
if m := ssacliPhysicalDriveLine.FindStringSubmatch(trimmed); len(m) == 3 {
|
if m := ssacliPhysicalDriveLine.FindStringSubmatch(trimmed); len(m) == 3 {
|
||||||
flush()
|
flush()
|
||||||
present := true
|
present := true
|
||||||
status := "UNKNOWN"
|
status := statusUnknown
|
||||||
s := schema.HardwareStorage{Present: &present, Status: &status}
|
s := schema.HardwareStorage{
|
||||||
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
|
Present: &present,
|
||||||
|
}
|
||||||
slot := m[1]
|
slot := m[1]
|
||||||
s.Slot = &slot
|
s.Slot = &slot
|
||||||
|
|
||||||
@@ -475,8 +480,8 @@ func storcliDriveToStorage(d struct {
|
|||||||
present := true
|
present := true
|
||||||
status := mapRAIDDriveStatus(d.State)
|
status := mapRAIDDriveStatus(d.State)
|
||||||
s := schema.HardwareStorage{
|
s := schema.HardwareStorage{
|
||||||
Present: &present,
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
Status: &status,
|
Present: &present,
|
||||||
}
|
}
|
||||||
|
|
||||||
if v := strings.TrimSpace(d.EIDSlt); v != "" {
|
if v := strings.TrimSpace(d.EIDSlt); v != "" {
|
||||||
@@ -527,15 +532,15 @@ func mapRAIDDriveStatus(raw string) string {
|
|||||||
u := strings.ToUpper(strings.TrimSpace(raw))
|
u := strings.ToUpper(strings.TrimSpace(raw))
|
||||||
switch {
|
switch {
|
||||||
case strings.Contains(u, "OK"), strings.Contains(u, "OPTIMAL"), strings.Contains(u, "READY"):
|
case strings.Contains(u, "OK"), strings.Contains(u, "OPTIMAL"), strings.Contains(u, "READY"):
|
||||||
return "OK"
|
return statusOK
|
||||||
case strings.Contains(u, "ONLN"), strings.Contains(u, "ONLINE"):
|
case strings.Contains(u, "ONLN"), strings.Contains(u, "ONLINE"):
|
||||||
return "OK"
|
return statusOK
|
||||||
case strings.Contains(u, "RBLD"), strings.Contains(u, "REBUILD"):
|
case strings.Contains(u, "RBLD"), strings.Contains(u, "REBUILD"):
|
||||||
return "WARNING"
|
return statusWarning
|
||||||
case strings.Contains(u, "FAIL"), strings.Contains(u, "OFFLINE"):
|
case strings.Contains(u, "FAIL"), strings.Contains(u, "OFFLINE"):
|
||||||
return "CRITICAL"
|
return statusCritical
|
||||||
default:
|
default:
|
||||||
return "UNKNOWN"
|
return statusUnknown
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -641,8 +646,9 @@ func enrichStorageWithVROC(storage []schema.HardwareStorage, pcie []schema.Hardw
|
|||||||
storage[i].Telemetry["vroc_array"] = arr.Name
|
storage[i].Telemetry["vroc_array"] = arr.Name
|
||||||
storage[i].Telemetry["vroc_degraded"] = arr.Degraded
|
storage[i].Telemetry["vroc_degraded"] = arr.Degraded
|
||||||
if arr.Degraded {
|
if arr.Degraded {
|
||||||
status := "WARNING"
|
status := statusWarning
|
||||||
storage[i].Status = &status
|
storage[i].Status = &status
|
||||||
|
storage[i].ErrorDescription = stringPtr("VROC array is degraded")
|
||||||
}
|
}
|
||||||
updated++
|
updated++
|
||||||
}
|
}
|
||||||
@@ -659,14 +665,14 @@ func hasVROCController(pcie []schema.HardwarePCIeDevice) bool {
|
|||||||
|
|
||||||
class := ""
|
class := ""
|
||||||
if dev.DeviceClass != nil {
|
if dev.DeviceClass != nil {
|
||||||
class = strings.ToLower(*dev.DeviceClass)
|
class = strings.TrimSpace(*dev.DeviceClass)
|
||||||
}
|
}
|
||||||
model := ""
|
model := ""
|
||||||
if dev.Model != nil {
|
if dev.Model != nil {
|
||||||
model = strings.ToLower(*dev.Model)
|
model = strings.ToLower(*dev.Model)
|
||||||
}
|
}
|
||||||
|
|
||||||
if strings.Contains(class, "raid") ||
|
if isRAIDClass(class) ||
|
||||||
strings.Contains(model, "vroc") ||
|
strings.Contains(model, "vroc") ||
|
||||||
strings.Contains(model, "volume management device") ||
|
strings.Contains(model, "volume management device") ||
|
||||||
strings.Contains(model, "vmd") {
|
strings.Contains(model, "vmd") {
|
||||||
|
|||||||
334
audit/internal/collector/raid_controller_telemetry.go
Normal file
334
audit/internal/collector/raid_controller_telemetry.go
Normal file
@@ -0,0 +1,334 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"encoding/json"
|
||||||
|
"log/slog"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
type raidControllerTelemetry struct {
|
||||||
|
BatteryChargePct *float64
|
||||||
|
BatteryHealthPct *float64
|
||||||
|
BatteryTemperatureC *float64
|
||||||
|
BatteryVoltageV *float64
|
||||||
|
BatteryReplaceRequired *bool
|
||||||
|
ErrorDescription *string
|
||||||
|
}
|
||||||
|
|
||||||
|
func enrichPCIeWithRAIDTelemetry(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
|
byVendor := collectRAIDControllerTelemetry()
|
||||||
|
if len(byVendor) == 0 {
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
positions := map[int]int{}
|
||||||
|
for i := range devs {
|
||||||
|
if devs[i].VendorID == nil || !isLikelyRAIDController(devs[i]) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vendor := *devs[i].VendorID
|
||||||
|
list := byVendor[vendor]
|
||||||
|
if len(list) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
index := positions[vendor]
|
||||||
|
if index >= len(list) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
positions[vendor] = index + 1
|
||||||
|
applyRAIDControllerTelemetry(&devs[i], list[index])
|
||||||
|
}
|
||||||
|
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyRAIDControllerTelemetry(dev *schema.HardwarePCIeDevice, tel raidControllerTelemetry) {
|
||||||
|
if tel.BatteryChargePct != nil {
|
||||||
|
dev.BatteryChargePct = tel.BatteryChargePct
|
||||||
|
}
|
||||||
|
if tel.BatteryHealthPct != nil {
|
||||||
|
dev.BatteryHealthPct = tel.BatteryHealthPct
|
||||||
|
}
|
||||||
|
if tel.BatteryTemperatureC != nil {
|
||||||
|
dev.BatteryTemperatureC = tel.BatteryTemperatureC
|
||||||
|
}
|
||||||
|
if tel.BatteryVoltageV != nil {
|
||||||
|
dev.BatteryVoltageV = tel.BatteryVoltageV
|
||||||
|
}
|
||||||
|
if tel.BatteryReplaceRequired != nil {
|
||||||
|
dev.BatteryReplaceRequired = tel.BatteryReplaceRequired
|
||||||
|
}
|
||||||
|
if tel.ErrorDescription != nil {
|
||||||
|
dev.ErrorDescription = tel.ErrorDescription
|
||||||
|
if dev.Status == nil || *dev.Status == statusOK {
|
||||||
|
status := statusWarning
|
||||||
|
dev.Status = &status
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectRAIDControllerTelemetry() map[int][]raidControllerTelemetry {
|
||||||
|
out := map[int][]raidControllerTelemetry{}
|
||||||
|
|
||||||
|
if raw, err := raidToolQuery("storcli64", "/call", "show", "all", "J"); err == nil {
|
||||||
|
list := parseStorcliControllerTelemetry(raw)
|
||||||
|
if len(list) > 0 {
|
||||||
|
out[vendorBroadcomLSI] = append(out[vendorBroadcomLSI], list...)
|
||||||
|
slog.Info("raid: storcli controller telemetry", "count", len(list))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := raidToolQuery("ssacli", "ctrl", "all", "show", "config", "detail"); err == nil {
|
||||||
|
list := parseSSACLIControllerTelemetry(string(raw))
|
||||||
|
if len(list) > 0 {
|
||||||
|
out[vendorHPE] = append(out[vendorHPE], list...)
|
||||||
|
slog.Info("raid: ssacli controller telemetry", "count", len(list))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := raidToolQuery("arcconf", "getconfig", "1", "ad"); err == nil {
|
||||||
|
list := parseArcconfControllerTelemetry(string(raw))
|
||||||
|
if len(list) > 0 {
|
||||||
|
out[vendorAdaptec] = append(out[vendorAdaptec], list...)
|
||||||
|
slog.Info("raid: arcconf controller telemetry", "count", len(list))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseStorcliControllerTelemetry(raw []byte) []raidControllerTelemetry {
|
||||||
|
var doc struct {
|
||||||
|
Controllers []struct {
|
||||||
|
ResponseData map[string]any `json:"Response Data"`
|
||||||
|
} `json:"Controllers"`
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||||
|
slog.Warn("raid: parse storcli controller telemetry failed", "err", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var out []raidControllerTelemetry
|
||||||
|
for _, ctl := range doc.Controllers {
|
||||||
|
tel := raidControllerTelemetry{}
|
||||||
|
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["BBU_Info"]))
|
||||||
|
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["BBU_Info_Details"]))
|
||||||
|
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["CV_Info"]))
|
||||||
|
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["CV_Info_Details"]))
|
||||||
|
if hasRAIDControllerTelemetry(tel) {
|
||||||
|
out = append(out, tel)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func nestedStringMap(raw any) map[string]string {
|
||||||
|
switch value := raw.(type) {
|
||||||
|
case map[string]any:
|
||||||
|
out := map[string]string{}
|
||||||
|
flattenStringMap("", value, out)
|
||||||
|
return out
|
||||||
|
case []any:
|
||||||
|
out := map[string]string{}
|
||||||
|
for _, item := range value {
|
||||||
|
if m, ok := item.(map[string]any); ok {
|
||||||
|
flattenStringMap("", m, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
default:
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func flattenStringMap(prefix string, in map[string]any, out map[string]string) {
|
||||||
|
for key, raw := range in {
|
||||||
|
fullKey := strings.TrimSpace(strings.ToLower(strings.Trim(prefix+" "+key, " ")))
|
||||||
|
switch value := raw.(type) {
|
||||||
|
case map[string]any:
|
||||||
|
flattenStringMap(fullKey, value, out)
|
||||||
|
case []any:
|
||||||
|
for _, item := range value {
|
||||||
|
if m, ok := item.(map[string]any); ok {
|
||||||
|
flattenStringMap(fullKey, m, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case string:
|
||||||
|
out[fullKey] = value
|
||||||
|
case json.Number:
|
||||||
|
out[fullKey] = value.String()
|
||||||
|
case float64:
|
||||||
|
out[fullKey] = strconv.FormatFloat(value, 'f', -1, 64)
|
||||||
|
case bool:
|
||||||
|
if value {
|
||||||
|
out[fullKey] = "true"
|
||||||
|
} else {
|
||||||
|
out[fullKey] = "false"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergeStorcliBatteryMap(tel *raidControllerTelemetry, fields map[string]string) {
|
||||||
|
if len(fields) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for key, raw := range fields {
|
||||||
|
lower := strings.ToLower(strings.TrimSpace(key))
|
||||||
|
switch {
|
||||||
|
case strings.Contains(lower, "relative state of charge"), strings.Contains(lower, "remaining capacity"), strings.Contains(lower, "charge"):
|
||||||
|
if tel.BatteryChargePct == nil {
|
||||||
|
tel.BatteryChargePct = parsePercentPtr(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "state of health"), strings.Contains(lower, "health"):
|
||||||
|
if tel.BatteryHealthPct == nil {
|
||||||
|
tel.BatteryHealthPct = parsePercentPtr(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "temperature"):
|
||||||
|
if tel.BatteryTemperatureC == nil {
|
||||||
|
tel.BatteryTemperatureC = parseFloatPtr(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "voltage"):
|
||||||
|
if tel.BatteryVoltageV == nil {
|
||||||
|
tel.BatteryVoltageV = parseFloatPtr(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "replace"), strings.Contains(lower, "replacement required"):
|
||||||
|
if tel.BatteryReplaceRequired == nil {
|
||||||
|
tel.BatteryReplaceRequired = parseReplaceRequired(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "learn cycle requested"), strings.Contains(lower, "battery state"), strings.Contains(lower, "capacitance state"):
|
||||||
|
if desc := batteryStateDescription(raw); desc != nil && tel.ErrorDescription == nil {
|
||||||
|
tel.ErrorDescription = desc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseSSACLIControllerTelemetry(raw string) []raidControllerTelemetry {
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
var out []raidControllerTelemetry
|
||||||
|
var current *raidControllerTelemetry
|
||||||
|
|
||||||
|
flush := func() {
|
||||||
|
if current != nil && hasRAIDControllerTelemetry(*current) {
|
||||||
|
out = append(out, *current)
|
||||||
|
}
|
||||||
|
current = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, line := range lines {
|
||||||
|
trimmed := strings.TrimSpace(line)
|
||||||
|
if trimmed == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(strings.ToLower(trimmed), "smart array") || strings.HasPrefix(strings.ToLower(trimmed), "controller ") {
|
||||||
|
flush()
|
||||||
|
current = &raidControllerTelemetry{}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if current == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if idx := strings.Index(trimmed, ":"); idx > 0 {
|
||||||
|
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||||
|
val := strings.TrimSpace(trimmed[idx+1:])
|
||||||
|
switch {
|
||||||
|
case strings.Contains(key, "capacitor temperature"), strings.Contains(key, "battery temperature"):
|
||||||
|
current.BatteryTemperatureC = parseFloatPtr(val)
|
||||||
|
case strings.Contains(key, "capacitor voltage"), strings.Contains(key, "battery voltage"):
|
||||||
|
current.BatteryVoltageV = parseFloatPtr(val)
|
||||||
|
case strings.Contains(key, "capacitor charge"), strings.Contains(key, "battery charge"):
|
||||||
|
current.BatteryChargePct = parsePercentPtr(val)
|
||||||
|
case strings.Contains(key, "capacitor health"), strings.Contains(key, "battery health"):
|
||||||
|
current.BatteryHealthPct = parsePercentPtr(val)
|
||||||
|
case strings.Contains(key, "replace") || strings.Contains(key, "failed"):
|
||||||
|
if current.BatteryReplaceRequired == nil {
|
||||||
|
current.BatteryReplaceRequired = parseReplaceRequired(val)
|
||||||
|
}
|
||||||
|
if desc := batteryStateDescription(val); desc != nil && current.ErrorDescription == nil {
|
||||||
|
current.ErrorDescription = desc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
flush()
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseArcconfControllerTelemetry(raw string) []raidControllerTelemetry {
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
tel := raidControllerTelemetry{}
|
||||||
|
for _, line := range lines {
|
||||||
|
trimmed := strings.TrimSpace(line)
|
||||||
|
if idx := strings.Index(trimmed, ":"); idx > 0 {
|
||||||
|
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||||
|
val := strings.TrimSpace(trimmed[idx+1:])
|
||||||
|
switch {
|
||||||
|
case strings.Contains(key, "battery temperature"), strings.Contains(key, "capacitor temperature"):
|
||||||
|
tel.BatteryTemperatureC = parseFloatPtr(val)
|
||||||
|
case strings.Contains(key, "battery voltage"), strings.Contains(key, "capacitor voltage"):
|
||||||
|
tel.BatteryVoltageV = parseFloatPtr(val)
|
||||||
|
case strings.Contains(key, "battery charge"), strings.Contains(key, "capacitor charge"):
|
||||||
|
tel.BatteryChargePct = parsePercentPtr(val)
|
||||||
|
case strings.Contains(key, "battery health"), strings.Contains(key, "capacitor health"):
|
||||||
|
tel.BatteryHealthPct = parsePercentPtr(val)
|
||||||
|
case strings.Contains(key, "replace"), strings.Contains(key, "failed"):
|
||||||
|
if tel.BatteryReplaceRequired == nil {
|
||||||
|
tel.BatteryReplaceRequired = parseReplaceRequired(val)
|
||||||
|
}
|
||||||
|
if desc := batteryStateDescription(val); desc != nil && tel.ErrorDescription == nil {
|
||||||
|
tel.ErrorDescription = desc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if hasRAIDControllerTelemetry(tel) {
|
||||||
|
return []raidControllerTelemetry{tel}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func hasRAIDControllerTelemetry(tel raidControllerTelemetry) bool {
|
||||||
|
return tel.BatteryChargePct != nil ||
|
||||||
|
tel.BatteryHealthPct != nil ||
|
||||||
|
tel.BatteryTemperatureC != nil ||
|
||||||
|
tel.BatteryVoltageV != nil ||
|
||||||
|
tel.BatteryReplaceRequired != nil ||
|
||||||
|
tel.ErrorDescription != nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parsePercentPtr(raw string) *float64 {
|
||||||
|
raw = strings.ReplaceAll(strings.TrimSpace(raw), "%", "")
|
||||||
|
return parseFloatPtr(raw)
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseReplaceRequired(raw string) *bool {
|
||||||
|
lower := strings.ToLower(strings.TrimSpace(raw))
|
||||||
|
switch {
|
||||||
|
case lower == "":
|
||||||
|
return nil
|
||||||
|
case strings.Contains(lower, "replace"), strings.Contains(lower, "failed"), strings.Contains(lower, "yes"), strings.Contains(lower, "required"):
|
||||||
|
value := true
|
||||||
|
return &value
|
||||||
|
case strings.Contains(lower, "no"), strings.Contains(lower, "ok"), strings.Contains(lower, "good"), strings.Contains(lower, "optimal"):
|
||||||
|
value := false
|
||||||
|
return &value
|
||||||
|
default:
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func batteryStateDescription(raw string) *string {
|
||||||
|
lower := strings.ToLower(strings.TrimSpace(raw))
|
||||||
|
if lower == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case strings.Contains(lower, "failed"), strings.Contains(lower, "fault"), strings.Contains(lower, "replace"), strings.Contains(lower, "warning"), strings.Contains(lower, "degraded"):
|
||||||
|
return &raw
|
||||||
|
default:
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,6 +1,10 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"errors"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestParseSASIrcuControllerIDs(t *testing.T) {
|
func TestParseSASIrcuControllerIDs(t *testing.T) {
|
||||||
raw := `LSI Corporation SAS2 IR Configuration Utility.
|
raw := `LSI Corporation SAS2 IR Configuration Utility.
|
||||||
@@ -90,7 +94,111 @@ physicaldrive 1I:1:2 (894 GB, SAS HDD, Failed)
|
|||||||
if drives[0].Status == nil || *drives[0].Status != "OK" {
|
if drives[0].Status == nil || *drives[0].Status != "OK" {
|
||||||
t.Fatalf("drive0 status: %v", drives[0].Status)
|
t.Fatalf("drive0 status: %v", drives[0].Status)
|
||||||
}
|
}
|
||||||
if drives[1].Status == nil || *drives[1].Status != "CRITICAL" {
|
if drives[1].Status == nil || *drives[1].Status != statusCritical {
|
||||||
t.Fatalf("drive1 status: %v", drives[1].Status)
|
t.Fatalf("drive1 status: %v", drives[1].Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseStorcliControllerTelemetry(t *testing.T) {
|
||||||
|
raw := []byte(`{
|
||||||
|
"Controllers": [
|
||||||
|
{
|
||||||
|
"Response Data": {
|
||||||
|
"BBU_Info": {
|
||||||
|
"State of Health": "98 %",
|
||||||
|
"Relative State of Charge": "76 %",
|
||||||
|
"Temperature": "41 C",
|
||||||
|
"Voltage": "12.3 V",
|
||||||
|
"Replacement required": "No"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
got := parseStorcliControllerTelemetry(raw)
|
||||||
|
if len(got) != 1 {
|
||||||
|
t.Fatalf("len(got)=%d want 1", len(got))
|
||||||
|
}
|
||||||
|
if got[0].BatteryHealthPct == nil || *got[0].BatteryHealthPct != 98 {
|
||||||
|
t.Fatalf("battery health=%v", got[0].BatteryHealthPct)
|
||||||
|
}
|
||||||
|
if got[0].BatteryChargePct == nil || *got[0].BatteryChargePct != 76 {
|
||||||
|
t.Fatalf("battery charge=%v", got[0].BatteryChargePct)
|
||||||
|
}
|
||||||
|
if got[0].BatteryTemperatureC == nil || *got[0].BatteryTemperatureC != 41 {
|
||||||
|
t.Fatalf("battery temperature=%v", got[0].BatteryTemperatureC)
|
||||||
|
}
|
||||||
|
if got[0].BatteryVoltageV == nil || *got[0].BatteryVoltageV != 12.3 {
|
||||||
|
t.Fatalf("battery voltage=%v", got[0].BatteryVoltageV)
|
||||||
|
}
|
||||||
|
if got[0].BatteryReplaceRequired == nil || *got[0].BatteryReplaceRequired {
|
||||||
|
t.Fatalf("battery replace=%v", got[0].BatteryReplaceRequired)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSSACLIControllerTelemetry(t *testing.T) {
|
||||||
|
raw := `Smart Array P440ar in Slot 0
|
||||||
|
Battery/Capacitor Count: 1
|
||||||
|
Capacitor Temperature (C): 37
|
||||||
|
Capacitor Charge (%): 94
|
||||||
|
Capacitor Health (%): 96
|
||||||
|
Capacitor Voltage (V): 9.8
|
||||||
|
Capacitor Failed: No
|
||||||
|
`
|
||||||
|
got := parseSSACLIControllerTelemetry(raw)
|
||||||
|
if len(got) != 1 {
|
||||||
|
t.Fatalf("len(got)=%d want 1", len(got))
|
||||||
|
}
|
||||||
|
if got[0].BatteryTemperatureC == nil || *got[0].BatteryTemperatureC != 37 {
|
||||||
|
t.Fatalf("battery temperature=%v", got[0].BatteryTemperatureC)
|
||||||
|
}
|
||||||
|
if got[0].BatteryChargePct == nil || *got[0].BatteryChargePct != 94 {
|
||||||
|
t.Fatalf("battery charge=%v", got[0].BatteryChargePct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichPCIeWithRAIDTelemetry(t *testing.T) {
|
||||||
|
orig := raidToolQuery
|
||||||
|
t.Cleanup(func() { raidToolQuery = orig })
|
||||||
|
raidToolQuery = func(name string, args ...string) ([]byte, error) {
|
||||||
|
switch name {
|
||||||
|
case "storcli64":
|
||||||
|
return []byte(`{
|
||||||
|
"Controllers": [
|
||||||
|
{
|
||||||
|
"Response Data": {
|
||||||
|
"CV_Info": {
|
||||||
|
"State of Health": "99 %",
|
||||||
|
"Relative State of Charge": "81 %",
|
||||||
|
"Temperature": "38 C",
|
||||||
|
"Voltage": "12.1 V",
|
||||||
|
"Replacement required": "No"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}`), nil
|
||||||
|
default:
|
||||||
|
return nil, errors.New("skip")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vendor := vendorBroadcomLSI
|
||||||
|
class := "MassStorageController"
|
||||||
|
status := statusOK
|
||||||
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
|
VendorID: &vendor,
|
||||||
|
DeviceClass: &class,
|
||||||
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
|
}}
|
||||||
|
out := enrichPCIeWithRAIDTelemetry(devs)
|
||||||
|
if out[0].BatteryHealthPct == nil || *out[0].BatteryHealthPct != 99 {
|
||||||
|
t.Fatalf("battery health=%v", out[0].BatteryHealthPct)
|
||||||
|
}
|
||||||
|
if out[0].BatteryChargePct == nil || *out[0].BatteryChargePct != 81 {
|
||||||
|
t.Fatalf("battery charge=%v", out[0].BatteryChargePct)
|
||||||
|
}
|
||||||
|
if out[0].BatteryVoltageV == nil || *out[0].BatteryVoltageV != 12.1 {
|
||||||
|
t.Fatalf("battery voltage=%v", out[0].BatteryVoltageV)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
373
audit/internal/collector/sensors.go
Normal file
373
audit/internal/collector/sensors.go
Normal file
@@ -0,0 +1,373 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"encoding/json"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
type sensorsDoc map[string]map[string]any
|
||||||
|
|
||||||
|
func collectSensors() *schema.HardwareSensors {
|
||||||
|
doc, err := readSensorsJSONDoc()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("sensors: unavailable, skipping", "err", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
sensors := buildSensorsFromDoc(doc)
|
||||||
|
if sensors == nil || (len(sensors.Fans) == 0 && len(sensors.Power) == 0 && len(sensors.Temperatures) == 0 && len(sensors.Other) == 0) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
slog.Info("sensors: collected",
|
||||||
|
"fans", len(sensors.Fans),
|
||||||
|
"power", len(sensors.Power),
|
||||||
|
"temperatures", len(sensors.Temperatures),
|
||||||
|
"other", len(sensors.Other),
|
||||||
|
)
|
||||||
|
return sensors
|
||||||
|
}
|
||||||
|
|
||||||
|
func readSensorsJSONDoc() (sensorsDoc, error) {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var doc sensorsDoc
|
||||||
|
if err := json.Unmarshal(out, &doc); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return doc, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||||
|
if len(doc) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
result := &schema.HardwareSensors{}
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
|
||||||
|
chips := make([]string, 0, len(doc))
|
||||||
|
for chip := range doc {
|
||||||
|
chips = append(chips, chip)
|
||||||
|
}
|
||||||
|
sort.Strings(chips)
|
||||||
|
|
||||||
|
for _, chip := range chips {
|
||||||
|
features := doc[chip]
|
||||||
|
location := sensorLocation(chip)
|
||||||
|
|
||||||
|
keys := make([]string, 0, len(features))
|
||||||
|
for key := range features {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
|
||||||
|
for _, key := range keys {
|
||||||
|
if strings.EqualFold(key, "Adapter") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
feature, ok := features[key].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(key)
|
||||||
|
if name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch classifySensorFeature(feature) {
|
||||||
|
case "fan":
|
||||||
|
item := buildFanSensor(name, location, feature)
|
||||||
|
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.Fans = append(result.Fans, *item)
|
||||||
|
case "temp":
|
||||||
|
item := buildTempSensor(name, location, feature)
|
||||||
|
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.Temperatures = append(result.Temperatures, *item)
|
||||||
|
case "power":
|
||||||
|
item := buildPowerSensor(name, location, feature)
|
||||||
|
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.Power = append(result.Power, *item)
|
||||||
|
default:
|
||||||
|
item := buildOtherSensor(name, location, feature)
|
||||||
|
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.Other = append(result.Other, *item)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseSensorsJSON(raw []byte) (*schema.HardwareSensors, error) {
|
||||||
|
var doc sensorsDoc
|
||||||
|
err := json.Unmarshal(raw, &doc)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return buildSensorsFromDoc(doc), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
|
||||||
|
key := sensorType + "\x00" + name
|
||||||
|
if _, ok := seen[key]; ok {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
seen[key] = struct{}{}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func sensorLocation(chip string) *string {
|
||||||
|
chip = strings.TrimSpace(chip)
|
||||||
|
if chip == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return &chip
|
||||||
|
}
|
||||||
|
|
||||||
|
func classifySensorFeature(feature map[string]any) string {
|
||||||
|
for key := range feature {
|
||||||
|
switch {
|
||||||
|
case strings.Contains(key, "fan") && strings.HasSuffix(key, "_input"):
|
||||||
|
return "fan"
|
||||||
|
case strings.Contains(key, "temp") && strings.HasSuffix(key, "_input"):
|
||||||
|
return "temp"
|
||||||
|
case strings.Contains(key, "power") && (strings.HasSuffix(key, "_input") || strings.HasSuffix(key, "_average")):
|
||||||
|
return "power"
|
||||||
|
case strings.Contains(key, "curr") && strings.HasSuffix(key, "_input"):
|
||||||
|
return "power"
|
||||||
|
case strings.HasPrefix(key, "in") && strings.HasSuffix(key, "_input"):
|
||||||
|
return "power"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "other"
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
|
||||||
|
rpm, ok := firstFeatureInt(feature, "_input")
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
|
||||||
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
|
item.Status = status
|
||||||
|
}
|
||||||
|
return item
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||||
|
celsius, ok := firstFeatureFloat(feature, "_input")
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
|
||||||
|
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
||||||
|
item.ThresholdWarningCelsius = &warning
|
||||||
|
}
|
||||||
|
if critical, ok := firstFeatureFloatWithSuffixes(feature, []string{"_crit", "_emergency"}); ok {
|
||||||
|
item.ThresholdCriticalCelsius = &critical
|
||||||
|
}
|
||||||
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
|
item.Status = status
|
||||||
|
} else {
|
||||||
|
item.Status = deriveTemperatureStatus(item.Celsius, item.ThresholdWarningCelsius, item.ThresholdCriticalCelsius)
|
||||||
|
}
|
||||||
|
return item
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||||
|
item := &schema.HardwarePowerSensor{Name: name, Location: location}
|
||||||
|
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
||||||
|
item.PowerW = &v
|
||||||
|
}
|
||||||
|
if v, ok := firstFeatureFloatWithPrefix(feature, "curr"); ok {
|
||||||
|
item.CurrentA = &v
|
||||||
|
}
|
||||||
|
if v, ok := firstFeatureFloatWithPrefix(feature, "in"); ok {
|
||||||
|
item.VoltageV = &v
|
||||||
|
}
|
||||||
|
if item.PowerW == nil && item.CurrentA == nil && item.VoltageV == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
|
item.Status = status
|
||||||
|
}
|
||||||
|
return item
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||||
|
value, unit, ok := firstGenericSensorValue(feature)
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
|
||||||
|
if unit != "" {
|
||||||
|
item.Unit = &unit
|
||||||
|
}
|
||||||
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
|
item.Status = status
|
||||||
|
}
|
||||||
|
return item
|
||||||
|
}
|
||||||
|
|
||||||
|
func sensorStatusFromFeature(feature map[string]any) *string {
|
||||||
|
for key, raw := range feature {
|
||||||
|
if !strings.HasSuffix(key, "_alarm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if number, ok := floatFromAny(raw); ok && number > 0 {
|
||||||
|
status := statusWarning
|
||||||
|
return &status
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func deriveTemperatureStatus(current, warning, critical *float64) *string {
|
||||||
|
if current == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case critical != nil && *current >= *critical:
|
||||||
|
status := statusCritical
|
||||||
|
return &status
|
||||||
|
case warning != nil && *current >= *warning:
|
||||||
|
status := statusWarning
|
||||||
|
return &status
|
||||||
|
default:
|
||||||
|
status := statusOK
|
||||||
|
return &status
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureInt(feature map[string]any, suffix string) (int, bool) {
|
||||||
|
for key, raw := range feature {
|
||||||
|
if strings.HasSuffix(key, suffix) {
|
||||||
|
if value, ok := floatFromAny(raw); ok {
|
||||||
|
return int(value), true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureFloat(feature map[string]any, suffix string) (float64, bool) {
|
||||||
|
return firstFeatureFloatWithSuffixes(feature, []string{suffix})
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureFloatWithSuffixes(feature map[string]any, suffixes []string) (float64, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
for _, suffix := range suffixes {
|
||||||
|
if strings.HasSuffix(key, suffix) {
|
||||||
|
if value, ok := floatFromAny(feature[key]); ok {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureFloatWithContains(feature map[string]any, parts []string) (float64, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
matched := true
|
||||||
|
for _, part := range parts {
|
||||||
|
if !strings.Contains(key, part) {
|
||||||
|
matched = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if matched {
|
||||||
|
if value, ok := floatFromAny(feature[key]); ok {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureFloatWithPrefix(feature map[string]any, prefix string) (float64, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
if strings.HasPrefix(key, prefix) && strings.HasSuffix(key, "_input") {
|
||||||
|
if value, ok := floatFromAny(feature[key]); ok {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstGenericSensorValue(feature map[string]any) (float64, string, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
if strings.HasSuffix(key, "_alarm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, ok := floatFromAny(feature[key])
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
unit := inferSensorUnit(key)
|
||||||
|
return value, unit, true
|
||||||
|
}
|
||||||
|
return 0, "", false
|
||||||
|
}
|
||||||
|
|
||||||
|
func inferSensorUnit(key string) string {
|
||||||
|
switch {
|
||||||
|
case strings.Contains(key, "humidity"):
|
||||||
|
return "%"
|
||||||
|
case strings.Contains(key, "intrusion"):
|
||||||
|
return ""
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sortedFeatureKeys(feature map[string]any) []string {
|
||||||
|
keys := make([]string, 0, len(feature))
|
||||||
|
for key := range feature {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
return keys
|
||||||
|
}
|
||||||
|
|
||||||
|
func floatFromAny(raw any) (float64, bool) {
|
||||||
|
switch value := raw.(type) {
|
||||||
|
case float64:
|
||||||
|
return value, true
|
||||||
|
case float32:
|
||||||
|
return float64(value), true
|
||||||
|
case int:
|
||||||
|
return float64(value), true
|
||||||
|
case int64:
|
||||||
|
return float64(value), true
|
||||||
|
case json.Number:
|
||||||
|
if f, err := value.Float64(); err == nil {
|
||||||
|
return f, true
|
||||||
|
}
|
||||||
|
case string:
|
||||||
|
if value == "" {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
if f, err := strconv.ParseFloat(value, 64); err == nil {
|
||||||
|
return f, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
54
audit/internal/collector/sensors_test.go
Normal file
54
audit/internal/collector/sensors_test.go
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
// TestParseSensorsJSON feeds a representative `sensors -j` payload through
// parseSensorsJSON and checks that readings are routed to the expected
// buckets: temp* -> Temperatures, fan* -> Fans, in*/curr*/power* -> Power,
// and humidity* -> Other (with the "%" unit inferred).
func TestParseSensorsJSON(t *testing.T) {
	raw := []byte(`{
"coretemp-isa-0000": {
"Adapter": "ISA adapter",
"Package id 0": {
"temp1_input": 61.5,
"temp1_max": 80.0,
"temp1_crit": 95.0
},
"fan1": {
"fan1_input": 4200
}
},
"acpitz-acpi-0": {
"Adapter": "ACPI interface",
"in0": {
"in0_input": 12.06
},
"curr1": {
"curr1_input": 0.64
},
"power1": {
"power1_average": 137.0
},
"humidity1": {
"humidity1_input": 38.5
}
}
}`)

	got, err := parseSensorsJSON(raw)
	if err != nil {
		t.Fatalf("parseSensorsJSON error: %v", err)
	}
	if got == nil {
		t.Fatal("expected sensors")
	}
	// Only temp1_input counts as a temperature reading (max/crit are limits).
	if len(got.Temperatures) != 1 || got.Temperatures[0].Celsius == nil || *got.Temperatures[0].Celsius != 61.5 {
		t.Fatalf("temperatures mismatch: %#v", got.Temperatures)
	}
	if len(got.Fans) != 1 || got.Fans[0].RPM == nil || *got.Fans[0].RPM != 4200 {
		t.Fatalf("fans mismatch: %#v", got.Fans)
	}
	// in0, curr1 and power1 all land in the Power bucket.
	if len(got.Power) != 3 {
		t.Fatalf("power sensors mismatch: %#v", got.Power)
	}
	// humidity is a generic sensor; its unit should be inferred as "%".
	if len(got.Other) != 1 || got.Other[0].Unit == nil || *got.Other[0].Unit != "%" {
		t.Fatalf("other sensors mismatch: %#v", got.Other)
	}
}
|
||||||
@@ -5,11 +5,13 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
func collectStorage() []schema.HardwareStorage {
|
func collectStorage() []schema.HardwareStorage {
|
||||||
devs := lsblkDevices()
|
devs := discoverStorageDevices()
|
||||||
result := make([]schema.HardwareStorage, 0, len(devs))
|
result := make([]schema.HardwareStorage, 0, len(devs))
|
||||||
for _, dev := range devs {
|
for _, dev := range devs {
|
||||||
var s schema.HardwareStorage
|
var s schema.HardwareStorage
|
||||||
@@ -26,19 +28,60 @@ func collectStorage() []schema.HardwareStorage {
|
|||||||
|
|
||||||
// lsblkDevice is a minimal lsblk JSON record.
|
// lsblkDevice is a minimal lsblk JSON record.
|
||||||
type lsblkDevice struct {
|
type lsblkDevice struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
Size string `json:"size"`
|
Size string `json:"size"`
|
||||||
Serial string `json:"serial"`
|
Serial string `json:"serial"`
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
Tran string `json:"tran"`
|
Tran string `json:"tran"`
|
||||||
Hctl string `json:"hctl"`
|
Hctl string `json:"hctl"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type lsblkRoot struct {
|
type lsblkRoot struct {
|
||||||
Blockdevices []lsblkDevice `json:"blockdevices"`
|
Blockdevices []lsblkDevice `json:"blockdevices"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// nvmeListRoot is the top-level shape of `nvme list -o json` output.
type nvmeListRoot struct {
	Devices []nvmeListDevice `json:"Devices"`
}
|
||||||
|
|
||||||
|
// nvmeListDevice is the per-controller record from `nvme list -o json`.
// Only the fields needed to build an lsblkDevice are declared.
type nvmeListDevice struct {
	DevicePath string `json:"DevicePath"`
	ModelNumber string `json:"ModelNumber"`
	SerialNumber string `json:"SerialNumber"`
	Firmware string `json:"Firmware"`
	// PhysicalSize is in bytes.
	PhysicalSize int64 `json:"PhysicalSize"`
}
|
||||||
|
|
||||||
|
func discoverStorageDevices() []lsblkDevice {
|
||||||
|
merged := map[string]lsblkDevice{}
|
||||||
|
for _, dev := range lsblkDevices() {
|
||||||
|
if dev.Name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
merged[dev.Name] = dev
|
||||||
|
}
|
||||||
|
for _, dev := range nvmeListDevices() {
|
||||||
|
if dev.Name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
current := merged[dev.Name]
|
||||||
|
merged[dev.Name] = mergeStorageDevice(current, dev)
|
||||||
|
}
|
||||||
|
|
||||||
|
disks := make([]lsblkDevice, 0, len(merged))
|
||||||
|
for _, dev := range merged {
|
||||||
|
if dev.Type == "" {
|
||||||
|
dev.Type = "disk"
|
||||||
|
}
|
||||||
|
if dev.Type != "disk" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
disks = append(disks, dev)
|
||||||
|
}
|
||||||
|
return disks
|
||||||
|
}
|
||||||
|
|
||||||
func lsblkDevices() []lsblkDevice {
|
func lsblkDevices() []lsblkDevice {
|
||||||
out, err := exec.Command("lsblk", "-J", "-d",
|
out, err := exec.Command("lsblk", "-J", "-d",
|
||||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||||
@@ -60,6 +103,59 @@ func lsblkDevices() []lsblkDevice {
|
|||||||
return disks
|
return disks
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func nvmeListDevices() []lsblkDevice {
|
||||||
|
out, err := exec.Command("nvme", "list", "-o", "json").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var root nvmeListRoot
|
||||||
|
if err := json.Unmarshal(out, &root); err != nil {
|
||||||
|
slog.Warn("storage: nvme list parse failed", "err", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
devices := make([]lsblkDevice, 0, len(root.Devices))
|
||||||
|
for _, dev := range root.Devices {
|
||||||
|
name := filepath.Base(strings.TrimSpace(dev.DevicePath))
|
||||||
|
if name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
devices = append(devices, lsblkDevice{
|
||||||
|
Name: name,
|
||||||
|
Type: "disk",
|
||||||
|
Size: strconv.FormatInt(dev.PhysicalSize, 10),
|
||||||
|
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||||
|
Model: strings.TrimSpace(dev.ModelNumber),
|
||||||
|
Tran: "nvme",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return devices
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergeStorageDevice(existing, incoming lsblkDevice) lsblkDevice {
|
||||||
|
if existing.Name == "" {
|
||||||
|
return incoming
|
||||||
|
}
|
||||||
|
if existing.Type == "" {
|
||||||
|
existing.Type = incoming.Type
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(existing.Size) == "" {
|
||||||
|
existing.Size = incoming.Size
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(existing.Serial) == "" {
|
||||||
|
existing.Serial = incoming.Serial
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(existing.Model) == "" {
|
||||||
|
existing.Model = incoming.Model
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(existing.Tran) == "" {
|
||||||
|
existing.Tran = incoming.Tran
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(existing.Hctl) == "" {
|
||||||
|
existing.Hctl = incoming.Hctl
|
||||||
|
}
|
||||||
|
return existing
|
||||||
|
}
|
||||||
|
|
||||||
// smartctlInfo is the subset of smartctl -j -a output we care about.
|
// smartctlInfo is the subset of smartctl -j -a output we care about.
|
||||||
type smartctlInfo struct {
|
type smartctlInfo struct {
|
||||||
ModelFamily string `json:"model_family"`
|
ModelFamily string `json:"model_family"`
|
||||||
@@ -67,14 +163,22 @@ type smartctlInfo struct {
|
|||||||
SerialNumber string `json:"serial_number"`
|
SerialNumber string `json:"serial_number"`
|
||||||
FirmwareVer string `json:"firmware_version"`
|
FirmwareVer string `json:"firmware_version"`
|
||||||
RotationRate int `json:"rotation_rate"`
|
RotationRate int `json:"rotation_rate"`
|
||||||
|
Temperature struct {
|
||||||
|
Current int `json:"current"`
|
||||||
|
} `json:"temperature"`
|
||||||
|
SmartStatus struct {
|
||||||
|
Passed bool `json:"passed"`
|
||||||
|
} `json:"smart_status"`
|
||||||
UserCapacity struct {
|
UserCapacity struct {
|
||||||
Bytes int64 `json:"bytes"`
|
Bytes int64 `json:"bytes"`
|
||||||
} `json:"user_capacity"`
|
} `json:"user_capacity"`
|
||||||
AtaSmartAttributes struct {
|
AtaSmartAttributes struct {
|
||||||
Table []struct {
|
Table []struct {
|
||||||
ID int `json:"id"`
|
ID int `json:"id"`
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Raw struct{ Value int64 `json:"value"` } `json:"raw"`
|
Raw struct {
|
||||||
|
Value int64 `json:"value"`
|
||||||
|
} `json:"raw"`
|
||||||
} `json:"table"`
|
} `json:"table"`
|
||||||
} `json:"ata_smart_attributes"`
|
} `json:"ata_smart_attributes"`
|
||||||
PowerOnTime struct {
|
PowerOnTime struct {
|
||||||
@@ -86,6 +190,7 @@ type smartctlInfo struct {
|
|||||||
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||||
present := true
|
present := true
|
||||||
s := schema.HardwareStorage{Present: &present}
|
s := schema.HardwareStorage{Present: &present}
|
||||||
|
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||||
|
|
||||||
tran := strings.ToLower(dev.Tran)
|
tran := strings.ToLower(dev.Tran)
|
||||||
devPath := "/dev/" + dev.Name
|
devPath := "/dev/" + dev.Name
|
||||||
@@ -149,69 +254,117 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
} else if info.RotationRate > 0 {
|
} else if info.RotationRate > 0 {
|
||||||
devType = "HDD"
|
devType = "HDD"
|
||||||
}
|
}
|
||||||
|
s.Type = &devType
|
||||||
|
|
||||||
// telemetry
|
if info.Temperature.Current > 0 {
|
||||||
tel := map[string]any{}
|
t := float64(info.Temperature.Current)
|
||||||
|
s.TemperatureC = &t
|
||||||
|
}
|
||||||
if info.PowerOnTime.Hours > 0 {
|
if info.PowerOnTime.Hours > 0 {
|
||||||
tel["power_on_hours"] = info.PowerOnTime.Hours
|
v := int64(info.PowerOnTime.Hours)
|
||||||
|
s.PowerOnHours = &v
|
||||||
}
|
}
|
||||||
if info.PowerCycleCount > 0 {
|
if info.PowerCycleCount > 0 {
|
||||||
tel["power_cycles"] = info.PowerCycleCount
|
v := int64(info.PowerCycleCount)
|
||||||
|
s.PowerCycles = &v
|
||||||
}
|
}
|
||||||
|
reallocated := int64(0)
|
||||||
|
pending := int64(0)
|
||||||
|
uncorrectable := int64(0)
|
||||||
|
lifeRemaining := int64(0)
|
||||||
for _, attr := range info.AtaSmartAttributes.Table {
|
for _, attr := range info.AtaSmartAttributes.Table {
|
||||||
switch attr.ID {
|
switch attr.ID {
|
||||||
case 5:
|
case 5:
|
||||||
tel["reallocated_sectors"] = attr.Raw.Value
|
reallocated = attr.Raw.Value
|
||||||
|
s.ReallocatedSectors = &reallocated
|
||||||
case 177:
|
case 177:
|
||||||
tel["wear_leveling_pct"] = attr.Raw.Value
|
value := float64(attr.Raw.Value)
|
||||||
|
s.LifeUsedPct = &value
|
||||||
case 231:
|
case 231:
|
||||||
tel["life_remaining_pct"] = attr.Raw.Value
|
lifeRemaining = attr.Raw.Value
|
||||||
|
value := float64(attr.Raw.Value)
|
||||||
|
s.LifeRemainingPct = &value
|
||||||
case 241:
|
case 241:
|
||||||
tel["total_lba_written"] = attr.Raw.Value
|
value := attr.Raw.Value
|
||||||
|
s.WrittenBytes = &value
|
||||||
|
case 197:
|
||||||
|
pending = attr.Raw.Value
|
||||||
|
s.CurrentPendingSectors = &pending
|
||||||
|
case 198:
|
||||||
|
uncorrectable = attr.Raw.Value
|
||||||
|
s.OfflineUncorrectable = &uncorrectable
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(tel) > 0 {
|
|
||||||
s.Telemetry = tel
|
status := storageHealthStatus{
|
||||||
|
overallPassed: info.SmartStatus.Passed,
|
||||||
|
hasOverall: true,
|
||||||
|
reallocatedSectors: reallocated,
|
||||||
|
pendingSectors: pending,
|
||||||
|
offlineUncorrectable: uncorrectable,
|
||||||
|
lifeRemainingPct: lifeRemaining,
|
||||||
}
|
}
|
||||||
|
setStorageHealthStatus(&s, status)
|
||||||
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
s.Type = &devType
|
s.Type = &devType
|
||||||
status := "OK"
|
status := statusUnknown
|
||||||
s.Status = &status
|
s.Status = &status
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
||||||
type nvmeSmartLog struct {
|
type nvmeSmartLog struct {
|
||||||
|
CriticalWarning int `json:"critical_warning"`
|
||||||
PercentageUsed int `json:"percentage_used"`
|
PercentageUsed int `json:"percentage_used"`
|
||||||
|
AvailableSpare int `json:"available_spare"`
|
||||||
|
SpareThreshold int `json:"spare_thresh"`
|
||||||
|
Temperature int64 `json:"temperature"`
|
||||||
PowerOnHours int64 `json:"power_on_hours"`
|
PowerOnHours int64 `json:"power_on_hours"`
|
||||||
PowerCycles int64 `json:"power_cycles"`
|
PowerCycles int64 `json:"power_cycles"`
|
||||||
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
|
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
|
||||||
|
DataUnitsRead int64 `json:"data_units_read"`
|
||||||
DataUnitsWritten int64 `json:"data_units_written"`
|
DataUnitsWritten int64 `json:"data_units_written"`
|
||||||
ControllerBusy int64 `json:"controller_busy_time"`
|
ControllerBusy int64 `json:"controller_busy_time"`
|
||||||
|
MediaErrors int64 `json:"media_errors"`
|
||||||
|
NumErrLogEntries int64 `json:"num_err_log_entries"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
||||||
type nvmeIDCtrl struct {
|
type nvmeIDCtrl struct {
|
||||||
ModelNumber string `json:"mn"`
|
ModelNumber string `json:"mn"`
|
||||||
SerialNumber string `json:"sn"`
|
SerialNumber string `json:"sn"`
|
||||||
FirmwareRev string `json:"fr"`
|
FirmwareRev string `json:"fr"`
|
||||||
TotalCapacity int64 `json:"tnvmcap"`
|
TotalCapacity int64 `json:"tnvmcap"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||||
present := true
|
present := true
|
||||||
devType := "NVMe"
|
devType := "NVMe"
|
||||||
iface := "NVMe"
|
iface := "NVMe"
|
||||||
status := "OK"
|
status := statusOK
|
||||||
s := schema.HardwareStorage{
|
s := schema.HardwareStorage{
|
||||||
Present: &present,
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
Type: &devType,
|
Present: &present,
|
||||||
Interface: &iface,
|
Type: &devType,
|
||||||
Status: &status,
|
Interface: &iface,
|
||||||
|
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||||
}
|
}
|
||||||
|
|
||||||
devPath := "/dev/" + dev.Name
|
devPath := "/dev/" + dev.Name
|
||||||
|
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
||||||
|
s.Model = &v
|
||||||
|
}
|
||||||
|
if v := cleanDMIValue(strings.TrimSpace(dev.Serial)); v != "" {
|
||||||
|
s.SerialNumber = &v
|
||||||
|
}
|
||||||
|
if size := parseStorageBytes(dev.Size); size > 0 {
|
||||||
|
gb := int(size / 1_000_000_000)
|
||||||
|
if gb > 0 {
|
||||||
|
s.SizeGB = &gb
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// id-ctrl: model, serial, firmware, capacity
|
// id-ctrl: model, serial, firmware, capacity
|
||||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||||
@@ -237,30 +390,131 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||||
var log nvmeSmartLog
|
var log nvmeSmartLog
|
||||||
if json.Unmarshal(out, &log) == nil {
|
if json.Unmarshal(out, &log) == nil {
|
||||||
tel := map[string]any{}
|
|
||||||
if log.PowerOnHours > 0 {
|
if log.PowerOnHours > 0 {
|
||||||
tel["power_on_hours"] = log.PowerOnHours
|
s.PowerOnHours = &log.PowerOnHours
|
||||||
}
|
}
|
||||||
if log.PowerCycles > 0 {
|
if log.PowerCycles > 0 {
|
||||||
tel["power_cycles"] = log.PowerCycles
|
s.PowerCycles = &log.PowerCycles
|
||||||
}
|
}
|
||||||
if log.UnsafeShutdowns > 0 {
|
if log.UnsafeShutdowns > 0 {
|
||||||
tel["unsafe_shutdowns"] = log.UnsafeShutdowns
|
s.UnsafeShutdowns = &log.UnsafeShutdowns
|
||||||
}
|
}
|
||||||
if log.PercentageUsed > 0 {
|
if log.PercentageUsed > 0 {
|
||||||
tel["percentage_used"] = log.PercentageUsed
|
v := float64(log.PercentageUsed)
|
||||||
|
s.LifeUsedPct = &v
|
||||||
|
remaining := 100 - v
|
||||||
|
s.LifeRemainingPct = &remaining
|
||||||
}
|
}
|
||||||
if log.DataUnitsWritten > 0 {
|
if log.DataUnitsWritten > 0 {
|
||||||
tel["data_units_written"] = log.DataUnitsWritten
|
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
|
||||||
|
s.WrittenBytes = &v
|
||||||
}
|
}
|
||||||
if log.ControllerBusy > 0 {
|
if log.DataUnitsRead > 0 {
|
||||||
tel["controller_busy_time"] = log.ControllerBusy
|
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
|
||||||
|
s.ReadBytes = &v
|
||||||
}
|
}
|
||||||
if len(tel) > 0 {
|
if log.AvailableSpare > 0 {
|
||||||
s.Telemetry = tel
|
v := float64(log.AvailableSpare)
|
||||||
|
s.AvailableSparePct = &v
|
||||||
}
|
}
|
||||||
|
if log.MediaErrors > 0 {
|
||||||
|
s.MediaErrors = &log.MediaErrors
|
||||||
|
}
|
||||||
|
if log.NumErrLogEntries > 0 {
|
||||||
|
s.ErrorLogEntries = &log.NumErrLogEntries
|
||||||
|
}
|
||||||
|
if log.Temperature > 0 {
|
||||||
|
v := float64(log.Temperature - 273)
|
||||||
|
s.TemperatureC = &v
|
||||||
|
}
|
||||||
|
setStorageHealthStatus(&s, storageHealthStatus{
|
||||||
|
criticalWarning: log.CriticalWarning,
|
||||||
|
percentageUsed: int64(log.PercentageUsed),
|
||||||
|
availableSpare: int64(log.AvailableSpare),
|
||||||
|
spareThreshold: int64(log.SpareThreshold),
|
||||||
|
unsafeShutdowns: log.UnsafeShutdowns,
|
||||||
|
mediaErrors: log.MediaErrors,
|
||||||
|
errorLogEntries: log.NumErrLogEntries,
|
||||||
|
})
|
||||||
|
return s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
status = statusUnknown
|
||||||
|
s.Status = &status
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseStorageBytes parses a plain decimal byte count; non-numeric input
// (e.g. a humanized size like "1.92 TB"), zero, or negatives yield 0.
func parseStorageBytes(raw string) int64 {
	n, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64)
	if err != nil || n <= 0 {
		return 0
	}
	return n
}
|
||||||
|
|
||||||
|
// nvmeDataUnitsToBytes converts an NVMe SMART-log data-unit count into
// bytes (each data unit covers 1000 sectors of 512 bytes = 512,000 bytes).
// Non-positive counts collapse to 0.
func nvmeDataUnitsToBytes(units int64) int64 {
	if units > 0 {
		return units * 512000
	}
	return 0
}
|
||||||
|
|
||||||
|
// storageHealthStatus gathers raw health indicators read from ATA SMART
// and NVMe logs; setStorageHealthStatus turns them into a single status.
type storageHealthStatus struct {
	// hasOverall marks that a SMART overall self-assessment was reported;
	// overallPassed is only meaningful when hasOverall is true.
	hasOverall bool
	overallPassed bool
	// ATA SMART raw attribute values (IDs 5, 197, 198, 231).
	reallocatedSectors int64
	pendingSectors int64
	offlineUncorrectable int64
	lifeRemainingPct int64
	// NVMe smart-log fields.
	criticalWarning int
	percentageUsed int64
	availableSpare int64
	spareThreshold int64
	unsafeShutdowns int64
	mediaErrors int64
	errorLogEntries int64
}
|
||||||
|
|
||||||
|
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
|
||||||
|
status := statusOK
|
||||||
|
var description *string
|
||||||
|
switch {
|
||||||
|
case health.hasOverall && !health.overallPassed:
|
||||||
|
status = statusCritical
|
||||||
|
description = stringPtr("SMART overall self-assessment failed")
|
||||||
|
case health.criticalWarning > 0:
|
||||||
|
status = statusCritical
|
||||||
|
description = stringPtr("NVMe critical warning is set")
|
||||||
|
case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
|
||||||
|
status = statusCritical
|
||||||
|
description = stringPtr("Pending or offline uncorrectable sectors detected")
|
||||||
|
case health.mediaErrors > 0:
|
||||||
|
status = statusWarning
|
||||||
|
description = stringPtr("Media errors reported")
|
||||||
|
case health.reallocatedSectors > 0:
|
||||||
|
status = statusWarning
|
||||||
|
description = stringPtr("Reallocated sectors detected")
|
||||||
|
case health.errorLogEntries > 0:
|
||||||
|
status = statusWarning
|
||||||
|
description = stringPtr("Device error log contains entries")
|
||||||
|
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
|
||||||
|
status = statusWarning
|
||||||
|
description = stringPtr("Life remaining is low")
|
||||||
|
case health.percentageUsed >= 95:
|
||||||
|
status = statusWarning
|
||||||
|
description = stringPtr("Drive wear level is high")
|
||||||
|
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
|
||||||
|
status = statusWarning
|
||||||
|
description = stringPtr("Available spare is at or below threshold")
|
||||||
|
case health.unsafeShutdowns > 100:
|
||||||
|
status = statusWarning
|
||||||
|
description = stringPtr("Unsafe shutdown count is high")
|
||||||
|
}
|
||||||
|
s.Status = &status
|
||||||
|
s.ErrorDescription = description
|
||||||
|
}
|
||||||
|
|
||||||
|
// stringPtr returns a pointer to value, for filling optional (*string)
// schema fields inline.
func stringPtr(value string) *string {
	v := value
	return &v
}
|
||||||
|
|||||||
33
audit/internal/collector/storage_discovery_test.go
Normal file
33
audit/internal/collector/storage_discovery_test.go
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
// TestMergeStorageDevicePrefersNonEmptyFields checks that merging copies
// size/serial/model from the incoming record when the existing record's
// fields are blank.
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
	t.Parallel()

	got := mergeStorageDevice(
		lsblkDevice{Name: "nvme0n1", Type: "disk", Tran: "nvme"},
		lsblkDevice{Name: "nvme0n1", Type: "disk", Size: "1024", Serial: "SN123", Model: "Kioxia"},
	)

	if got.Serial != "SN123" {
		t.Fatalf("serial=%q want SN123", got.Serial)
	}
	if got.Model != "Kioxia" {
		t.Fatalf("model=%q want Kioxia", got.Model)
	}
	if got.Size != "1024" {
		t.Fatalf("size=%q want 1024", got.Size)
	}
}
|
||||||
|
|
||||||
|
// TestParseStorageBytes covers the two interesting cases: a padded decimal
// parses to its value, and a humanized size string is rejected as 0.
func TestParseStorageBytes(t *testing.T) {
	t.Parallel()

	if got := parseStorageBytes(" 2048 "); got != 2048 {
		t.Fatalf("parseStorageBytes=%d want 2048", got)
	}
	if got := parseStorageBytes("1.92 TB"); got != 0 {
		t.Fatalf("parseStorageBytes invalid=%d want 0", got)
	}
}
|
||||||
63
audit/internal/collector/storage_health_test.go
Normal file
63
audit/internal/collector/storage_health_test.go
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestSetStorageHealthStatus table-drives the severity rules: SMART
// failure, NVMe critical warning and pending sectors map to critical;
// media errors, reallocated sectors and low life remaining map to
// warning; no findings map to OK.
func TestSetStorageHealthStatus(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name string
		health storageHealthStatus
		want string
	}{
		{
			name: "smart overall failed",
			health: storageHealthStatus{hasOverall: true, overallPassed: false},
			want: statusCritical,
		},
		{
			name: "nvme critical warning",
			health: storageHealthStatus{criticalWarning: 1},
			want: statusCritical,
		},
		{
			name: "pending sectors",
			health: storageHealthStatus{pendingSectors: 1},
			want: statusCritical,
		},
		{
			name: "media errors warning",
			health: storageHealthStatus{mediaErrors: 2},
			want: statusWarning,
		},
		{
			name: "reallocated warning",
			health: storageHealthStatus{reallocatedSectors: 1},
			want: statusWarning,
		},
		{
			name: "life remaining low",
			health: storageHealthStatus{lifeRemainingPct: 8},
			want: statusWarning,
		},
		{
			name: "healthy",
			health: storageHealthStatus{},
			want: statusOK,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			var disk schema.HardwareStorage
			setStorageHealthStatus(&disk, tt.health)
			if disk.Status == nil || *disk.Status != tt.want {
				t.Fatalf("status=%v want %q", disk.Status, tt.want)
			}
		})
	}
}
|
||||||
114
audit/internal/collector/summary.go
Normal file
114
audit/internal/collector/summary.go
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BuildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
|
||||||
|
summary := &schema.HardwareHealthSummary{
|
||||||
|
Status: statusOK,
|
||||||
|
CollectedAt: time.Now().UTC().Format(time.RFC3339),
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, dimm := range snap.Memory {
|
||||||
|
switch derefString(dimm.Status) {
|
||||||
|
case statusWarning:
|
||||||
|
summary.MemoryWarn++
|
||||||
|
summary.Warnings = append(summary.Warnings, formatMemorySummary(dimm))
|
||||||
|
case statusCritical:
|
||||||
|
summary.MemoryFail++
|
||||||
|
summary.Failures = append(summary.Failures, formatMemorySummary(dimm))
|
||||||
|
case statusEmpty:
|
||||||
|
summary.EmptyDIMMs++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, disk := range snap.Storage {
|
||||||
|
switch derefString(disk.Status) {
|
||||||
|
case statusWarning:
|
||||||
|
summary.StorageWarn++
|
||||||
|
summary.Warnings = append(summary.Warnings, formatStorageSummary(disk))
|
||||||
|
case statusCritical:
|
||||||
|
summary.StorageFail++
|
||||||
|
summary.Failures = append(summary.Failures, formatStorageSummary(disk))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, dev := range snap.PCIeDevices {
|
||||||
|
switch derefString(dev.Status) {
|
||||||
|
case statusWarning:
|
||||||
|
summary.PCIeWarn++
|
||||||
|
summary.Warnings = append(summary.Warnings, formatPCIeSummary(dev))
|
||||||
|
case statusCritical:
|
||||||
|
summary.PCIeFail++
|
||||||
|
summary.Failures = append(summary.Failures, formatPCIeSummary(dev))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, psu := range snap.PowerSupplies {
|
||||||
|
if psu.Present != nil && !*psu.Present {
|
||||||
|
summary.MissingPSUs++
|
||||||
|
}
|
||||||
|
switch derefString(psu.Status) {
|
||||||
|
case statusWarning:
|
||||||
|
summary.PSUWarn++
|
||||||
|
summary.Warnings = append(summary.Warnings, formatPSUSummary(psu))
|
||||||
|
case statusCritical:
|
||||||
|
summary.PSUFail++
|
||||||
|
summary.Failures = append(summary.Failures, formatPSUSummary(psu))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(summary.Failures) > 0 || summary.StorageFail > 0 || summary.PCIeFail > 0 || summary.PSUFail > 0 || summary.MemoryFail > 0 {
|
||||||
|
summary.Status = statusCritical
|
||||||
|
} else if len(summary.Warnings) > 0 || summary.StorageWarn > 0 || summary.PCIeWarn > 0 || summary.PSUWarn > 0 || summary.MemoryWarn > 0 {
|
||||||
|
summary.Status = statusWarning
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(summary.Warnings) == 0 {
|
||||||
|
summary.Warnings = nil
|
||||||
|
}
|
||||||
|
if len(summary.Failures) == 0 {
|
||||||
|
summary.Failures = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return summary
|
||||||
|
}
|
||||||
|
|
||||||
|
// derefString returns the pointed-to string, or "" for a nil pointer.
func derefString(value *string) string {
	if value != nil {
		return *value
	}
	return ""
}
|
||||||
|
|
||||||
|
// preferredName picks the most human-meaningful identifier available for a
// component: model first, then serial number, then slot, falling back to
// "unknown" when none is set.
func preferredName(model, serial, slot *string) string {
	for _, candidate := range []*string{model, serial, slot} {
		if candidate != nil && *candidate != "" {
			return *candidate
		}
	}
	return "unknown"
}
|
||||||
|
|
||||||
|
func formatStorageSummary(disk schema.HardwareStorage) string {
|
||||||
|
return fmt.Sprintf("storage %s status=%s", preferredName(disk.Model, disk.SerialNumber, disk.Slot), derefString(disk.Status))
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatPCIeSummary(dev schema.HardwarePCIeDevice) string {
|
||||||
|
return fmt.Sprintf("pcie %s status=%s", preferredName(dev.Model, dev.SerialNumber, dev.BDF), derefString(dev.Status))
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatPSUSummary(psu schema.HardwarePowerSupply) string {
|
||||||
|
return fmt.Sprintf("psu %s status=%s", preferredName(psu.Model, psu.SerialNumber, psu.Slot), derefString(psu.Status))
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatMemorySummary(dimm schema.HardwareMemory) string {
|
||||||
|
return fmt.Sprintf("memory %s status=%s", preferredName(dimm.PartNumber, dimm.SerialNumber, dimm.Slot), derefString(dimm.Status))
|
||||||
|
}
|
||||||
@@ -31,7 +31,7 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
|||||||
func TestHasVROCController(t *testing.T) {
|
func TestHasVROCController(t *testing.T) {
|
||||||
intel := vendorIntel
|
intel := vendorIntel
|
||||||
model := "Volume Management Device NVMe RAID Controller"
|
model := "Volume Management Device NVMe RAID Controller"
|
||||||
class := "RAID bus controller"
|
class := "MassStorageController"
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
pcie []schema.HardwarePCIeDevice
|
pcie []schema.HardwarePCIeDevice
|
||||||
|
|||||||
153
audit/internal/platform/export.go
Normal file
153
audit/internal/platform/export.go
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
var exportExecCommand = exec.Command
|
||||||
|
|
||||||
|
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
|
||||||
|
msg := strings.TrimSpace(raw)
|
||||||
|
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||||
|
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||||
|
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// removableTargetReadOnly reports whether lsblk's key/value fields describe
// a target we must not write to: an explicit read-only flag, or an
// inherently read-only filesystem (iso9660 / squashfs).
func removableTargetReadOnly(fields map[string]string) bool {
	if fields["RO"] == "1" {
		return true
	}
	fstype := strings.ToLower(strings.TrimSpace(fields["FSTYPE"]))
	return fstype == "iso9660" || fstype == "squashfs"
}
|
||||||
|
|
||||||
|
func ensureWritableMountpoint(mountpoint string) error {
|
||||||
|
probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("target filesystem is not writable: %w", err)
|
||||||
|
}
|
||||||
|
name := probe.Name()
|
||||||
|
if closeErr := probe.Close(); closeErr != nil {
|
||||||
|
_ = os.Remove(name)
|
||||||
|
return closeErr
|
||||||
|
}
|
||||||
|
if err := os.Remove(name); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListRemovableTargets enumerates removable block devices carrying a
// writable filesystem, suitable as export destinations (e.g. USB sticks).
// It parses `lsblk -P` key/value output, filters out optical/loop devices
// and read-only filesystems, and returns the survivors sorted by device
// path for deterministic display.
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
	raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
	if err != nil {
		return nil, err
	}

	var out []RemovableTarget
	for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
		if strings.TrimSpace(line) == "" {
			continue
		}
		fields := parseLSBLKPairs(line)
		deviceType := fields["TYPE"]
		// Optical and loop devices are never valid export targets.
		if deviceType == "rom" || deviceType == "loop" {
			continue
		}

		removable := fields["RM"] == "1"
		if !removable {
			// Partitions may report RM=0 even when the parent disk is
			// removable; fall back to the parent's sysfs "removable" flag.
			if parent := fields["PKNAME"]; parent != "" {
				if data, err := os.ReadFile(filepath.Join("/sys/class/block", parent, "removable")); err == nil {
					removable = strings.TrimSpace(string(data)) == "1"
				}
			}
		}
		// Keep only removable devices that expose a writable filesystem.
		if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
			continue
		}

		out = append(out, RemovableTarget{
			Device: "/dev/" + fields["NAME"],
			FSType: fields["FSTYPE"],
			Size: fields["SIZE"],
			Label: fields["LABEL"],
			Model: fields["MODEL"],
			Mountpoint: fields["MOUNTPOINT"],
		})
	}

	// Deterministic ordering for callers that display the list.
	sort.Slice(out, func(i, j int) bool { return out[i].Device < out[j].Device })
	return out, nil
}
|
||||||
|
|
||||||
|
// ExportFileToTarget copies src onto the removable device described by target
// and returns the destination path on the target's filesystem.
//
// If target.Mountpoint is empty, the device is first mounted on a throwaway
// directory under /tmp. In either case the filesystem is synced and unmounted
// before returning — pre-mounted targets are unmounted too (see
// TestExportFileToTargetUnmountsExistingMountpoint). An unmount failure is
// reported through the named return retErr only when no earlier error occurred.
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
	if src == "" || target.Device == "" {
		return "", fmt.Errorf("source and target are required")
	}
	if _, err := os.Stat(src); err != nil {
		return "", err
	}

	mountpoint := strings.TrimSpace(target.Mountpoint)
	mountedHere := false // true when we created the temp dir and mounted the device ourselves
	mounted := mountpoint != ""
	if mountpoint == "" {
		// Device not mounted yet: mount it on a temp dir named after the device.
		mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
		if err := os.MkdirAll(mountpoint, 0755); err != nil {
			return "", err
		}
		if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
			_ = os.Remove(mountpoint)
			return "", formatMountTargetError(target, string(raw), err)
		}
		mountedHere = true
		mounted = true
	}
	// Cleanup runs on every exit path: flush caches, unmount, and remove the
	// temp dir if we created it. Unmount errors only become the function's
	// error when retErr is still nil, so an earlier failure is not masked.
	defer func() {
		if !mounted {
			return
		}
		_ = exportExecCommand("sync").Run()
		if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
			msg := strings.TrimSpace(string(raw))
			if msg == "" {
				retErr = err
			} else {
				retErr = fmt.Errorf("%s: %w", msg, err)
			}
		}
		if mountedHere {
			_ = os.Remove(mountpoint)
		}
	}()

	// Probe writability before copying (e.g. read-only mounts, write-protect
	// switches) so the caller gets a targeted error instead of a raw I/O one.
	if err := ensureWritableMountpoint(mountpoint); err != nil {
		return "", err
	}

	filename := filepath.Base(src)
	dst = filepath.Join(mountpoint, filename)
	// NOTE(review): the whole source file is buffered in memory; fine for
	// small bundles, consider streaming (os.Open + io.Copy) for large ones.
	data, err := os.ReadFile(src)
	if err != nil {
		return "", err
	}
	if err := os.WriteFile(dst, data, 0644); err != nil {
		return "", err
	}

	return dst, nil
}
|
||||||
112
audit/internal/platform/export_test.go
Normal file
112
audit/internal/platform/export_test.go
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||||
|
mountpoint := filepath.Join(tmp, "mnt")
|
||||||
|
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||||
|
t.Fatalf("mkdir mountpoint: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||||
|
t.Fatalf("write src: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var calls [][]string
|
||||||
|
oldExec := exportExecCommand
|
||||||
|
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
calls = append(calls, append([]string{name}, args...))
|
||||||
|
return exec.Command("sh", "-c", "exit 0")
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||||
|
|
||||||
|
s := &System{}
|
||||||
|
dst, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
Mountpoint: mountpoint,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExportFileToTarget error: %v", err)
|
||||||
|
}
|
||||||
|
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
|
||||||
|
t.Fatalf("dst=%q want %q", got, want)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
|
||||||
|
t.Fatalf("exported file missing: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
foundUmount := false
|
||||||
|
for _, call := range calls {
|
||||||
|
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
|
||||||
|
foundUmount = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !foundUmount {
|
||||||
|
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||||
|
mountpoint := filepath.Join(tmp, "mnt")
|
||||||
|
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||||
|
t.Fatalf("mkdir mountpoint: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||||
|
t.Fatalf("write src: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.Chmod(mountpoint, 0555); err != nil {
|
||||||
|
t.Fatalf("chmod mountpoint: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
oldExec := exportExecCommand
|
||||||
|
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
return exec.Command("sh", "-c", "exit 0")
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||||
|
|
||||||
|
s := &System{}
|
||||||
|
_, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
Mountpoint: mountpoint,
|
||||||
|
})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error for non-writable mountpoint")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "target filesystem is not writable") {
|
||||||
|
t.Fatalf("err=%q want writable message", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
|
||||||
|
oldExec := exportExecCommand
|
||||||
|
lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
|
||||||
|
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
|
||||||
|
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
|
||||||
|
cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||||
|
|
||||||
|
s := &System{}
|
||||||
|
targets, err := s.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ListRemovableTargets error: %v", err)
|
||||||
|
}
|
||||||
|
if len(targets) != 1 {
|
||||||
|
t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
|
||||||
|
}
|
||||||
|
if got := targets[0].Device; got != "/dev/sdb1" {
|
||||||
|
t.Fatalf("device=%q want /dev/sdb1", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
738
audit/internal/platform/gpu_metrics.go
Normal file
738
audit/internal/platform/gpu_metrics.go
Normal file
@@ -0,0 +1,738 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||||
|
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
// Field values come from `nvidia-smi --query-gpu=...` with nounits output;
// unparsable fields are stored as 0 (see parseGPUFloat).
type GPUMetricRow struct {
	ElapsedSec float64 // elapsed time of the sample in seconds (written as elapsed_sec in CSV; set by the collector, not by sampleGPUMetrics)
	GPUIndex   int     // nvidia-smi GPU index
	TempC      float64 // temperature.gpu, degrees Celsius
	UsagePct   float64 // utilization.gpu, percent
	PowerW     float64 // power.draw, watts
	ClockMHz   float64 // clocks.current.graphics, MHz
}
|
||||||
|
|
||||||
|
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
|
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||||
|
args := []string{
|
||||||
|
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
ids := make([]string, len(gpuIndices))
|
||||||
|
for i, idx := range gpuIndices {
|
||||||
|
ids[i] = strconv.Itoa(idx)
|
||||||
|
}
|
||||||
|
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||||
|
}
|
||||||
|
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var rows []GPUMetricRow
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ", ")
|
||||||
|
if len(parts) < 5 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
rows = append(rows, GPUMetricRow{
|
||||||
|
GPUIndex: idx,
|
||||||
|
TempC: parseGPUFloat(parts[1]),
|
||||||
|
UsagePct: parseGPUFloat(parts[2]),
|
||||||
|
PowerW: parseGPUFloat(parts[3]),
|
||||||
|
ClockMHz: parseGPUFloat(parts[4]),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return rows, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseGPUFloat converts one nvidia-smi CSV field to a float64.
// Placeholder values ("N/A", "[Not Supported]", empty) and anything
// unparsable map to 0 so a single bad field never aborts a sample.
func parseGPUFloat(s string) float64 {
	trimmed := strings.TrimSpace(s)
	switch trimmed {
	case "", "N/A", "[Not Supported]":
		return 0
	}
	value, err := strconv.ParseFloat(trimmed, 64)
	if err != nil {
		return 0
	}
	return value
}
|
||||||
|
|
||||||
|
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
|
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
// Exported wrapper around sampleGPUMetrics: a non-empty gpuIndices limits the
// query to those GPU ids, an empty slice queries all GPUs.
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
	return sampleGPUMetrics(gpuIndices)
}
|
||||||
|
|
||||||
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
|
var b bytes.Buffer
|
||||||
|
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
|
||||||
|
for _, r := range rows {
|
||||||
|
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
|
||||||
|
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
|
||||||
|
}
|
||||||
|
return os.WriteFile(path, b.Bytes(), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||||
|
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||||
|
// Group by GPU index preserving order.
|
||||||
|
seen := make(map[int]bool)
|
||||||
|
var order []int
|
||||||
|
gpuMap := make(map[int][]GPUMetricRow)
|
||||||
|
for _, r := range rows {
|
||||||
|
if !seen[r.GPUIndex] {
|
||||||
|
seen[r.GPUIndex] = true
|
||||||
|
order = append(order, r.GPUIndex)
|
||||||
|
}
|
||||||
|
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||||
|
}
|
||||||
|
|
||||||
|
var svgs strings.Builder
|
||||||
|
for _, gpuIdx := range order {
|
||||||
|
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
|
||||||
|
svgs.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
ts := time.Now().UTC().Format("2006-01-02 15:04:05 UTC")
|
||||||
|
html := fmt.Sprintf(`<!DOCTYPE html>
|
||||||
|
<html><head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<title>GPU Stress Test Metrics</title>
|
||||||
|
<style>
|
||||||
|
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
|
||||||
|
h1 { text-align: center; color: #333; margin: 0 0 8px; }
|
||||||
|
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
|
||||||
|
</style>
|
||||||
|
</head><body>
|
||||||
|
<h1>GPU Stress Test Metrics</h1>
|
||||||
|
<p>Generated %s</p>
|
||||||
|
%s
|
||||||
|
</body></html>`, ts, svgs.String())
|
||||||
|
|
||||||
|
return os.WriteFile(path, []byte(html), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||||
|
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
//
// The chart plots four series (temperature, usage, power, graphics clock)
// against elapsed time, each scaled to its own Y axis: temperature and usage
// on the left, power and clock on the right. Grid lines align with the
// temperature axis ticks. Returns the SVG as a string (no I/O performed).
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
	// Layout
	const W, H = 960, 520
	const plotX1 = 120 // usage axis / chart left border
	const plotX2 = 840 // power axis / chart right border
	const plotY1 = 70  // top
	const plotY2 = 465 // bottom (PH = 395)
	const PW = plotX2 - plotX1
	const PH = plotY2 - plotY1
	// Outer axes
	const tempAxisX = 60   // temp axis line
	const clockAxisX = 900 // clock axis line

	// Series order everywhere below: 0=temp, 1=usage, 2=power, 3=clock.
	colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
	seriesLabel := [4]string{
		fmt.Sprintf("GPU %d Temp (°C)", gpuIdx),
		fmt.Sprintf("GPU %d Usage (%%)", gpuIdx),
		fmt.Sprintf("GPU %d Power (W)", gpuIdx),
		fmt.Sprintf("GPU %d Clock (MHz)", gpuIdx),
	}
	axisLabel := [4]string{"Temperature (°C)", "GPU Usage (%)", "Power (W)", "Clock (MHz)"}

	// Extract series
	t := make([]float64, len(rows))
	vals := [4][]float64{}
	for i := range vals {
		vals[i] = make([]float64, len(rows))
	}
	for i, r := range rows {
		t[i] = r.ElapsedSec
		vals[0][i] = r.TempC
		vals[1][i] = r.UsagePct
		vals[2][i] = r.PowerW
		vals[3][i] = r.ClockMHz
	}

	// Each series gets its own "nice" tick scale so all four can share the
	// same plot rectangle while being read off different axes.
	tMin, tMax := gpuMinMax(t)
	type axisScale struct {
		ticks    []float64
		min, max float64
	}
	var axes [4]axisScale
	for i := 0; i < 4; i++ {
		mn, mx := gpuMinMax(vals[i])
		tks := gpuNiceTicks(mn, mx, 8)
		axes[i] = axisScale{ticks: tks, min: tks[0], max: tks[len(tks)-1]}
	}

	// xv/yv map data coordinates to SVG pixel coordinates; degenerate ranges
	// collapse to a fixed position instead of dividing by zero.
	xv := func(tv float64) float64 {
		if tMax == tMin {
			return float64(plotX1)
		}
		return float64(plotX1) + (tv-tMin)/(tMax-tMin)*float64(PW)
	}
	yv := func(v float64, ai int) float64 {
		a := axes[ai]
		if a.max == a.min {
			return float64(plotY1 + PH/2)
		}
		return float64(plotY2) - (v-a.min)/(a.max-a.min)*float64(PH)
	}

	var b strings.Builder

	fmt.Fprintf(&b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d"`+
		` style="background:#fff;border-radius:8px;display:block;margin:0 auto 24px;`+
		`box-shadow:0 2px 12px rgba(0,0,0,.12)">`+"\n", W, H)

	// Title
	fmt.Fprintf(&b, `<text x="%d" y="22" text-anchor="middle" font-family="sans-serif"`+
		` font-size="14" font-weight="bold" fill="#333">GPU Stress Test Metrics — GPU %d</text>`+"\n",
		plotX1+PW/2, gpuIdx)

	// Horizontal grid (align to temp axis ticks)
	b.WriteString(`<g stroke="#e0e0e0" stroke-width="0.5">` + "\n")
	for _, tick := range axes[0].ticks {
		y := yv(tick, 0)
		if y < float64(plotY1) || y > float64(plotY2) {
			continue
		}
		fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
			plotX1, y, plotX2, y)
	}
	// Vertical grid
	xTicks := gpuNiceTicks(tMin, tMax, 10)
	for _, tv := range xTicks {
		x := xv(tv)
		if x < float64(plotX1) || x > float64(plotX2) {
			continue
		}
		fmt.Fprintf(&b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
			x, plotY1, x, plotY2)
	}
	b.WriteString("</g>\n")

	// Chart border
	fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
		` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
		plotX1, plotY1, PW, PH)

	// X axis ticks and labels
	b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#333" text-anchor="middle">` + "\n")
	for _, tv := range xTicks {
		x := xv(tv)
		if x < float64(plotX1) || x > float64(plotX2) {
			continue
		}
		fmt.Fprintf(&b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, plotY2+18, gpuFormatTick(tv))
		fmt.Fprintf(&b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d" stroke="#333" stroke-width="1"/>`+"\n",
			x, plotY2, x, plotY2+4)
	}
	b.WriteString("</g>\n")
	fmt.Fprintf(&b, `<text x="%d" y="%d" font-family="sans-serif" font-size="13"`+
		` fill="#333" text-anchor="middle">Time (seconds)</text>`+"\n",
		plotX1+PW/2, plotY2+38)

	// Y axes: [tempAxisX, plotX1, plotX2, clockAxisX]
	axisLineX := [4]int{tempAxisX, plotX1, plotX2, clockAxisX}
	axisRight := [4]bool{false, false, true, true}
	// Label x positions (for rotated vertical text)
	axisLabelX := [4]int{10, 68, 868, 950}

	for i := 0; i < 4; i++ {
		ax := axisLineX[i]
		right := axisRight[i]
		color := colors[i]

		// Axis line
		fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d"`+
			` stroke="%s" stroke-width="1"/>`+"\n",
			ax, plotY1, ax, plotY2, color)

		// Ticks and tick labels; right-hand axes mirror tick direction and
		// text anchoring.
		fmt.Fprintf(&b, `<g font-family="sans-serif" font-size="10" fill="%s">`+"\n", color)
		for _, tick := range axes[i].ticks {
			y := yv(tick, i)
			if y < float64(plotY1) || y > float64(plotY2) {
				continue
			}
			dx := -5
			textX := ax - 8
			anchor := "end"
			if right {
				dx = 5
				textX = ax + 8
				anchor = "start"
			}
			fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"`+
				` stroke="%s" stroke-width="1"/>`+"\n",
				ax, y, ax+dx, y, color)
			fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="%s" dy="4">%s</text>`+"\n",
				textX, y, anchor, gpuFormatTick(tick))
		}
		b.WriteString("</g>\n")

		// Axis label (rotated)
		lx := axisLabelX[i]
		fmt.Fprintf(&b, `<text transform="translate(%d,%d) rotate(-90)"`+
			` font-family="sans-serif" font-size="12" fill="%s" text-anchor="middle">%s</text>`+"\n",
			lx, plotY1+PH/2, color, axisLabel[i])
	}

	// Data lines
	for i := 0; i < 4; i++ {
		var pts strings.Builder
		for j := range rows {
			x := xv(t[j])
			y := yv(vals[i][j], i)
			if j == 0 {
				fmt.Fprintf(&pts, "%.1f,%.1f", x, y)
			} else {
				fmt.Fprintf(&pts, " %.1f,%.1f", x, y)
			}
		}
		fmt.Fprintf(&b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="1.5"/>`+"\n",
			pts.String(), colors[i])
	}

	// Legend
	const legendY = 42
	for i := 0; i < 4; i++ {
		lx := plotX1 + i*(PW/4) + 10
		fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d"`+
			` stroke="%s" stroke-width="2"/>`+"\n",
			lx, legendY, lx+20, legendY, colors[i])
		fmt.Fprintf(&b, `<text x="%d" y="%d" font-family="sans-serif" font-size="12" fill="#333">%s</text>`+"\n",
			lx+25, legendY+4, seriesLabel[i])
	}

	b.WriteString("</svg>\n")
	return b.String()
}
|
||||||
|
|
||||||
|
// ANSI SGR escape codes used to colour the terminal charts.
const (
	ansiRed    = "\033[31m"
	ansiBlue   = "\033[34m"
	ansiGreen  = "\033[32m"
	ansiYellow = "\033[33m"
	ansiReset  = "\033[0m" // restore default terminal attributes
)

// Default dimensions (in character cells) of the per-metric terminal charts.
const (
	termChartWidth  = 70
	termChartHeight = 12
)
|
||||||
|
|
||||||
|
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
||||||
|
// Suitable for display in the TUI screenOutput.
|
||||||
|
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||||
|
seen := make(map[int]bool)
|
||||||
|
var order []int
|
||||||
|
gpuMap := make(map[int][]GPUMetricRow)
|
||||||
|
for _, r := range rows {
|
||||||
|
if !seen[r.GPUIndex] {
|
||||||
|
seen[r.GPUIndex] = true
|
||||||
|
order = append(order, r.GPUIndex)
|
||||||
|
}
|
||||||
|
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||||
|
}
|
||||||
|
|
||||||
|
type seriesDef struct {
|
||||||
|
caption string
|
||||||
|
color string
|
||||||
|
fn func(GPUMetricRow) float64
|
||||||
|
}
|
||||||
|
defs := []seriesDef{
|
||||||
|
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||||
|
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||||
|
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||||
|
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||||
|
}
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
for _, gpuIdx := range order {
|
||||||
|
gr := gpuMap[gpuIdx]
|
||||||
|
if len(gr) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
|
||||||
|
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
|
||||||
|
for _, d := range defs {
|
||||||
|
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
|
||||||
|
termChartHeight, termChartWidth))
|
||||||
|
b.WriteRune('\n')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.TrimRight(b.String(), "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU.
|
||||||
|
// Each series is normalised to its own min–max and drawn in a different colour.
|
||||||
|
// chartWidth controls the width of the plot area (Y-axis label uses 5 extra chars).
|
||||||
|
// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU.
// Each series is normalised to its own min–max and drawn in a different colour.
// chartWidth controls the width of the plot area (Y-axis label uses 5 extra chars).
//
// Series drawn: usage (blue), temperature (red), power (green). Later series
// overwrite earlier ones where they overlap on the shared character grid.
// The Y axis shows 100%/50%/0% of each series' own normalised range.
func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string {
	if chartWidth < 20 {
		chartWidth = 70 // guard against unusably narrow terminals
	}
	const chartHeight = 14

	// Group samples per GPU, preserving first-seen order.
	seen := make(map[int]bool)
	var order []int
	gpuMap := make(map[int][]GPUMetricRow)
	for _, r := range rows {
		if !seen[r.GPUIndex] {
			seen[r.GPUIndex] = true
			order = append(order, r.GPUIndex)
		}
		gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
	}

	type seriesDef struct {
		label string
		color string
		unit  string
		fn    func(GPUMetricRow) float64
	}
	defs := []seriesDef{
		{"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }},
		{"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }},
		{"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }},
	}

	var b strings.Builder
	for _, gpuIdx := range order {
		gr := gpuMap[gpuIdx]
		if len(gr) == 0 {
			continue
		}
		elapsed := gr[len(gr)-1].ElapsedSec

		// Build value slices for each series.
		type seriesData struct {
			seriesDef
			vals []float64
			mn   float64
			mx   float64
		}
		var series []seriesData
		for _, d := range defs {
			vals := extractGPUField(gr, d.fn)
			mn, mx := gpuMinMax(vals)
			if mn == mx {
				// Widen flat series so normalisation never divides by zero.
				mx = mn + 1
			}
			series = append(series, seriesData{d, vals, mn, mx})
		}

		// Shared character grid: row 0 = top (max), row chartHeight = bottom (min).
		type cell struct {
			ch    rune
			color string
		}
		grid := make([][]cell, chartHeight+1)
		for r := range grid {
			grid[r] = make([]cell, chartWidth)
			for c := range grid[r] {
				grid[r][c] = cell{' ', ""}
			}
		}

		// Plot each series onto the shared grid.
		for _, s := range series {
			w := chartWidth
			if len(s.vals) < w {
				w = len(s.vals)
			}
			data := gpuDownsample(s.vals, w)
			prevRow := -1
			for x, v := range data {
				row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight)))
				if row < 0 {
					row = 0
				}
				if row > chartHeight {
					row = chartHeight
				}
				if prevRow < 0 || prevRow == row {
					// First column, or no level change: horizontal segment.
					grid[row][x] = cell{'─', s.color}
				} else {
					// Level change: vertical run plus rounded corners.
					lo, hi := prevRow, row
					if lo > hi {
						lo, hi = hi, lo
					}
					for y := lo + 1; y < hi; y++ {
						grid[y][x] = cell{'│', s.color}
					}
					if prevRow < row {
						grid[prevRow][x] = cell{'╮', s.color}
						grid[row][x] = cell{'╰', s.color}
					} else {
						grid[prevRow][x] = cell{'╯', s.color}
						grid[row][x] = cell{'╭', s.color}
					}
				}
				prevRow = row
			}
		}

		// Render: Y axis + data rows.
		fmt.Fprintf(&b, "GPU %d (%.0fs) each series normalised to its range\n", gpuIdx, elapsed)
		for r := 0; r <= chartHeight; r++ {
			// Y axis label: 100% at top, 50% in middle, 0% at bottom.
			switch r {
			case 0:
				fmt.Fprintf(&b, "%4s┤", "100%")
			case chartHeight / 2:
				fmt.Fprintf(&b, "%4s┤", "50%")
			case chartHeight:
				fmt.Fprintf(&b, "%4s┤", "0%")
			default:
				fmt.Fprintf(&b, "%4s│", "")
			}
			for c := 0; c < chartWidth; c++ {
				cl := grid[r][c]
				if cl.color != "" {
					b.WriteString(cl.color)
					b.WriteRune(cl.ch)
					b.WriteString(ansiReset)
				} else {
					b.WriteRune(' ')
				}
			}
			b.WriteRune('\n')
		}
		// Bottom axis.
		b.WriteString(" └")
		b.WriteString(strings.Repeat("─", chartWidth))
		b.WriteRune('\n')

		// Legend with current (last) values.
		b.WriteString(" ")
		for i, s := range series {
			last := s.vals[len(s.vals)-1]
			b.WriteString(s.color)
			fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit)
			b.WriteString(ansiReset)
			if i < len(series)-1 {
				b.WriteString(" ")
			}
		}
		b.WriteRune('\n')
	}

	return strings.TrimRight(b.String(), "\n")
}
|
||||||
|
|
||||||
|
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
||||||
|
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
||||||
|
// renderLineChart draws a single time-series line chart using box-drawing characters.
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
//
// vals is the series to plot; color is an ANSI escape prefix applied to the
// plot area of every row (reset after each row); height/width are the plot
// dimensions in character cells. An empty vals yields just the caption line.
func renderLineChart(vals []float64, color, caption string, height, width int) string {
	if len(vals) == 0 {
		return caption + "\n"
	}

	mn, mx := gpuMinMax(vals)
	if mn == mx {
		// Widen flat series so scaling never divides by zero.
		mx = mn + 1
	}

	// Use the smaller of width or len(vals) to avoid stretching sparse data.
	w := width
	if len(vals) < w {
		w = len(vals)
	}
	data := gpuDownsample(vals, w)

	// row[i] = display row index: 0 = top = max value, height = bottom = min value.
	row := make([]int, w)
	for i, v := range data {
		r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
		if r < 0 {
			r = 0
		}
		if r > height {
			r = height
		}
		row[i] = r
	}

	// Fill the character grid.
	grid := make([][]rune, height+1)
	for i := range grid {
		grid[i] = make([]rune, w)
		for j := range grid[i] {
			grid[i][j] = ' '
		}
	}
	for x := 0; x < w; x++ {
		r := row[x]
		if x == 0 {
			grid[r][0] = '─'
			continue
		}
		p := row[x-1]
		switch {
		case r == p:
			grid[r][x] = '─'
		case r < p: // value went up (row index decreased toward top)
			grid[r][x] = '╭'
			grid[p][x] = '╯'
			for y := r + 1; y < p; y++ {
				grid[y][x] = '│'
			}
		default: // r > p, value went down
			grid[p][x] = '╮'
			grid[r][x] = '╰'
			for y := p + 1; y < r; y++ {
				grid[y][x] = '│'
			}
		}
	}

	// Y axis tick labels, placed on the row each tick value maps to.
	ticks := gpuNiceTicks(mn, mx, height/2)
	tickAtRow := make(map[int]string)
	labelWidth := 4
	for _, t := range ticks {
		r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
		if r < 0 || r > height {
			continue
		}
		s := gpuFormatTick(t)
		tickAtRow[r] = s
		if len(s) > labelWidth {
			labelWidth = len(s)
		}
	}

	var b strings.Builder
	for r := 0; r <= height; r++ {
		label := tickAtRow[r]
		fmt.Fprintf(&b, "%*s", labelWidth, label)
		// Axis glyph: ┤ on labelled rows, ┼ at the bottom corner, │ elsewhere.
		switch {
		case label != "":
			b.WriteRune('┤')
		case r == height:
			b.WriteRune('┼')
		default:
			b.WriteRune('│')
		}
		b.WriteString(color)
		b.WriteString(string(grid[r]))
		b.WriteString(ansiReset)
		b.WriteRune('\n')
	}

	// Bottom axis.
	b.WriteString(strings.Repeat(" ", labelWidth))
	b.WriteRune('└')
	b.WriteString(strings.Repeat("─", w))
	b.WriteRune('\n')

	// Caption centered under the chart.
	if caption != "" {
		total := labelWidth + 1 + w
		if pad := (total - len(caption)) / 2; pad > 0 {
			b.WriteString(strings.Repeat(" ", pad))
		}
		b.WriteString(caption)
		b.WriteRune('\n')
	}

	return b.String()
}
|
||||||
|
|
||||||
|
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
|
||||||
|
v := make([]float64, len(rows))
|
||||||
|
for i, r := range rows {
|
||||||
|
v[i] = fn(r)
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
|
||||||
|
// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
|
||||||
|
// gpuDownsample resamples vals into exactly w buckets: when len(vals) >= w it
// averages the source samples that land in each bucket, otherwise it
// nearest-neighbour upsamples.
//
// Fix: a non-positive w now yields an empty slice. Previously w == 0 with a
// non-empty input panicked (result[0] index out of range) and a negative w
// panicked inside make().
// An empty vals yields w zeros so callers always get a fixed-width series.
func gpuDownsample(vals []float64, w int) []float64 {
	if w <= 0 {
		return nil
	}
	n := len(vals)
	if n == 0 {
		return make([]float64, w)
	}
	result := make([]float64, w)
	if n >= w {
		// Average every source sample that falls into each bucket.
		counts := make([]int, w)
		for i, v := range vals {
			bucket := i * w / n
			if bucket >= w {
				bucket = w - 1
			}
			result[bucket] += v
			counts[bucket]++
		}
		for i := range result {
			if counts[i] > 0 {
				result[i] /= float64(counts[i])
			}
		}
		return result
	}
	// Nearest-neighbour upsample. Here 1 <= n < w implies w >= 2, so the
	// (w - 1) divisor is safe.
	for i := range result {
		src := i * (n - 1) / (w - 1)
		if src >= n {
			src = n - 1
		}
		result[i] = vals[src]
	}
	return result
}
|
||||||
|
|
||||||
|
// gpuMinMax reports the smallest and largest values in vals.
// An empty slice yields the sentinel range (0, 1) so callers always get a
// non-degenerate interval to scale against.
func gpuMinMax(vals []float64) (float64, float64) {
	if len(vals) == 0 {
		return 0, 1
	}
	lo := vals[0]
	hi := vals[0]
	for _, v := range vals {
		if v < lo {
			lo = v
		}
		if v > hi {
			hi = v
		}
	}
	return lo, hi
}
|
||||||
|
|
||||||
|
// gpuNiceTicks produces human-friendly axis tick values covering [mn, mx],
// aiming for roughly targetCount ticks. The step is a power of ten scaled by
// 1, 2, 5 or 10; ticks extend to the nearest step multiples enclosing the
// range. A degenerate range (mn == mx) is widened by ±1 first.
func gpuNiceTicks(mn, mx float64, targetCount int) []float64 {
	if mn == mx {
		mn, mx = mn-1, mx+1
	}
	span := mx - mn
	base := math.Pow(10, math.Floor(math.Log10(span/float64(targetCount))))
	step := base
	for _, mult := range []float64{1, 2, 5, 10} {
		if span/(mult*base) <= float64(targetCount)*1.5 {
			step = mult * base
			break
		}
	}
	first := math.Floor(mn/step) * step
	last := math.Ceil(mx/step) * step
	var ticks []float64
	// The step*0.001 slack keeps the final tick despite float accumulation.
	for v := first; v <= last+step*0.001; v += step {
		ticks = append(ticks, math.Round(v*1e9)/1e9)
	}
	return ticks
}
|
||||||
|
|
||||||
|
// gpuFormatTick renders an axis tick value: whole numbers are printed as
// integers (no decimal point), everything else with one decimal place.
func gpuFormatTick(v float64) string {
	if math.Trunc(v) != v {
		return strconv.FormatFloat(v, 'f', 1, 64)
	}
	return strconv.Itoa(int(v))
}
|
||||||
105
audit/internal/platform/install.go
Normal file
105
audit/internal/platform/install.go
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// InstallDisk describes a candidate disk for installation, as discovered by
// ListInstallDisks from lsblk output.
type InstallDisk struct {
	Device string // device node path, e.g. /dev/sda
	Model  string // hardware model as reported by lsblk; may be empty
	Size   string // human-readable size from lsblk, e.g. "500G"
}
|
||||||
|
|
||||||
|
// ListInstallDisks returns block devices suitable for installation.
// It lists whole disks via lsblk, excluding USB-attached drives and the
// disk backing the current live boot medium (see findLiveBootDevice).
func (s *System) ListInstallDisks() ([]InstallDisk, error) {
	// -d: whole disks only (no partitions); -n: no header row.
	out, err := exec.Command("lsblk", "-dn", "-o", "NAME,MODEL,SIZE,TYPE,TRAN").Output()
	if err != nil {
		return nil, fmt.Errorf("lsblk: %w", err)
	}
	bootDev := findLiveBootDevice()
	var disks []InstallDisk
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		fields := strings.Fields(line)
		// NAME MODEL SIZE TYPE TRAN — model may have spaces so we parse from end
		if len(fields) < 4 {
			continue
		}
		// Last field: TRAN, second-to-last: TYPE, third-to-last: SIZE
		// NOTE(review): if lsblk prints an empty TRAN (e.g. virtio disks),
		// the positional parse shifts and TYPE lands in tran, so the device
		// is silently skipped — confirm whether such disks should be listed.
		tran := fields[len(fields)-1]
		typ := fields[len(fields)-2]
		size := fields[len(fields)-3]
		name := fields[0]
		model := strings.Join(fields[1:len(fields)-3], " ")
		if typ != "disk" {
			continue
		}
		if strings.EqualFold(tran, "usb") {
			// Skip removable/USB media — not an install target.
			continue
		}
		device := "/dev/" + name
		if device == bootDev {
			// Never offer the disk we are currently booted from.
			continue
		}
		disks = append(disks, InstallDisk{
			Device: device,
			Model:  strings.TrimSpace(model),
			Size:   size,
		})
	}
	return disks, nil
}
|
||||||
|
|
||||||
|
// findLiveBootDevice returns the whole-disk block device backing
// /run/live/medium (the live-boot mount point), or "" when that mount does
// not exist or cannot be resolved.
func findLiveBootDevice() string {
	// findmnt prints the SOURCE device of the mount, e.g. /dev/sdb1.
	out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", "/run/live/medium").Output()
	if err != nil {
		return ""
	}
	src := strings.TrimSpace(string(out))
	if src == "" {
		return ""
	}
	// Strip partition suffix to get the whole disk device.
	// e.g. /dev/sdb1 → /dev/sdb, /dev/nvme0n1p1 → /dev/nvme0n1
	// lsblk PKNAME reports the parent kernel device name of a partition.
	out2, err := exec.Command("lsblk", "-no", "PKNAME", src).Output()
	if err != nil || strings.TrimSpace(string(out2)) == "" {
		// No parent (src is already a whole disk) — return it unchanged.
		return src
	}
	return "/dev/" + strings.TrimSpace(string(out2))
}
|
||||||
|
|
||||||
|
// InstallToDisk runs `bee-install <device> <logfile>` and blocks until it
// finishes. The context can be used to cancel, which kills the process.
// NOTE(review): this function does not redirect output itself — bee-install
// is presumably responsible for writing logFile; confirm against that script.
func (s *System) InstallToDisk(ctx context.Context, device string, logFile string) error {
	cmd := exec.CommandContext(ctx, "bee-install", device, logFile)
	return cmd.Run()
}
|
||||||
|
|
||||||
|
// InstallLogPath returns the default install log path for a given device,
// with path separators and spaces in the device name turned into
// underscores (e.g. "/dev/sda" → "/tmp/bee-install_dev_sda.log").
func InstallLogPath(device string) string {
	sanitized := strings.ReplaceAll(device, "/", "_")
	sanitized = strings.ReplaceAll(sanitized, " ", "_")
	return "/tmp/bee-install" + sanitized + ".log"
}
|
||||||
|
|
||||||
|
// Label returns a one-line display label for the disk:
// "<device> <size> <model>", substituting "Unknown" for an empty model.
func (d InstallDisk) Label() string {
	model := d.Model
	if model == "" {
		model = "Unknown"
	}
	// NOTE(review): the ParseInt result below is computed and immediately
	// discarded — dead code. It cannot be removed in isolation because it is
	// the only use of strconv in this file; remove both together.
	sizeBytes, err := strconv.ParseInt(strings.TrimSuffix(d.Size, "B"), 10, 64)
	_ = sizeBytes
	_ = err
	return fmt.Sprintf("%s %s %s", d.Device, d.Size, model)
}
|
||||||
45
audit/internal/platform/live_metrics.go
Normal file
45
audit/internal/platform/live_metrics.go
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// LiveMetricSample is a single point-in-time snapshot of server metrics
// collected for the web UI metrics page. Slices may be empty when the
// corresponding source (ipmitool, nvidia-smi) is unavailable.
type LiveMetricSample struct {
	Timestamp time.Time      `json:"ts"`      // sample time, UTC
	Fans      []FanReading   `json:"fans"`    // chassis fan speeds
	Temps     []TempReading  `json:"temps"`   // named temperature sensors
	PowerW    float64        `json:"power_w"` // system power draw; 0 when unavailable
	GPUs      []GPUMetricRow `json:"gpus"`    // per-GPU metrics
}
|
||||||
|
|
||||||
|
// TempReading is a named temperature sensor value.
type TempReading struct {
	Name    string  `json:"name"`    // sensor label, e.g. "CPU"
	Celsius float64 `json:"celsius"` // temperature in degrees Celsius
}
|
||||||
|
|
||||||
|
// SampleLiveMetrics collects a single metrics snapshot from all available
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
// and system power (via ipmitool dcmi). Missing sources are silently skipped
// — errors from the individual samplers are deliberately discarded so the
// metrics page degrades gracefully on partial hardware.
func SampleLiveMetrics() LiveMetricSample {
	s := LiveMetricSample{Timestamp: time.Now().UTC()}

	// GPU metrics — skipped silently if nvidia-smi unavailable
	gpus, _ := SampleGPUMetrics(nil)
	s.GPUs = gpus

	// Fan speeds — skipped silently if ipmitool unavailable
	fans, _ := sampleFanSpeeds()
	s.Fans = fans

	// CPU/system temperature — returns 0 if unavailable, in which case no
	// TempReading is emitted at all.
	cpuTemp := sampleCPUMaxTemp()
	if cpuTemp > 0 {
		s.Temps = append(s.Temps, TempReading{Name: "CPU", Celsius: cpuTemp})
	}

	// System power — returns 0 if unavailable
	s.PowerW = sampleSystemPower()

	return s
}
|
||||||
156
audit/internal/platform/network.go
Normal file
156
audit/internal/platform/network.go
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ListInterfaces returns the candidate physical interfaces (per
// listInterfaceNames) with their link state and IPv4 addresses, gathered by
// shelling out to ip(8). Per-interface query failures degrade to
// state "unknown" / no addresses rather than failing the whole call.
func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
	names, err := listInterfaceNames()
	if err != nil {
		return nil, err
	}
	out := make([]InterfaceInfo, 0, len(names))
	for _, name := range names {
		state := "unknown"
		if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
			fields := strings.Fields(string(raw))
			// fields[8] is the token after "state" in typical `ip -o link`
			// output — position-dependent; assumes iproute2's usual layout.
			if len(fields) >= 9 {
				state = fields[8]
			}
		}
		var ipv4 []string
		if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
			for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
				fields := strings.Fields(line)
				// fields[3] is the CIDR address in `ip -o -4 addr` output.
				if len(fields) >= 4 {
					ipv4 = append(ipv4, fields[3])
				}
			}
		}
		out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
	}
	return out, nil
}
|
||||||
|
|
||||||
|
func (s *System) DefaultRoute() string {
|
||||||
|
raw, err := exec.Command("ip", "route", "show", "default").Output()
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
fields := strings.Fields(string(raw))
|
||||||
|
for i := 0; i < len(fields)-1; i++ {
|
||||||
|
if fields[i] == "via" {
|
||||||
|
return fields[i+1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) DHCPOne(iface string) (string, error) {
|
||||||
|
var out bytes.Buffer
|
||||||
|
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
||||||
|
fmt.Fprintf(&out, "WARN: ip link set up failed: %v\n", err)
|
||||||
|
}
|
||||||
|
if raw, err := exec.Command("dhclient", "-r", iface).CombinedOutput(); err == nil {
|
||||||
|
out.Write(raw)
|
||||||
|
} else if len(raw) > 0 {
|
||||||
|
out.Write(raw)
|
||||||
|
}
|
||||||
|
raw, err := exec.Command("dhclient", "-4", "-v", iface).CombinedOutput()
|
||||||
|
out.Write(raw)
|
||||||
|
if err != nil {
|
||||||
|
return out.String(), err
|
||||||
|
}
|
||||||
|
return out.String(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// DHCPAll runs DHCPOne on every candidate interface and concatenates the
// per-interface logs, each prefixed with "[iface]". Individual interface
// failures are recorded inline ("ERROR: …") but do not abort the loop; the
// returned error only reflects a failure to enumerate interfaces.
func (s *System) DHCPAll() (string, error) {
	ifaces, err := listInterfaceNames()
	if err != nil {
		return "", err
	}
	var out strings.Builder
	for _, iface := range ifaces {
		fmt.Fprintf(&out, "[%s]\n", iface)
		log, err := s.DHCPOne(iface)
		out.WriteString(log)
		if err != nil {
			out.Fprintf := error // placeholder removed
		}
		out.WriteString("\n")
	}
	return out.String(), nil
}
|
||||||
|
|
||||||
|
// SetStaticIPv4 applies a static IPv4 configuration to cfg.Interface:
// flushes existing addresses, assigns cfg.Address/cfg.Prefix, optionally
// replaces the default route with cfg.Gateway, and rewrites
// /etc/resolv.conf with cfg.DNS (or a built-in fallback list).
// It returns a progress log; on error the log contains whatever steps
// completed plus the failing command's output. The step order matters:
// the address must exist before the route that uses the interface.
// NOTE(review): /etc/resolv.conf is overwritten unconditionally — confirm
// this host does not run resolvconf/systemd-resolved, which would fight it.
func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) {
	if cfg.Interface == "" || cfg.Address == "" || cfg.Prefix == "" {
		return "", fmt.Errorf("interface, address, and prefix are required")
	}

	dns := cfg.DNS
	if len(dns) == 0 {
		// Fallback resolvers used when the caller supplies none.
		dns = []string{"77.88.8.8", "77.88.8.1", "1.1.1.1", "8.8.8.8"}
	}

	var out strings.Builder
	// Best-effort preparation: bring the link up and clear old addresses.
	_ = exec.Command("ip", "link", "set", cfg.Interface, "up").Run()
	_ = exec.Command("ip", "addr", "flush", "dev", cfg.Interface).Run()
	if raw, err := exec.Command("ip", "addr", "add", cfg.Address+"/"+cfg.Prefix, "dev", cfg.Interface).CombinedOutput(); err != nil {
		return string(raw), err
	}
	out.WriteString("address configured\n")
	if cfg.Gateway != "" {
		// Replace (not add alongside) any existing default route.
		_ = exec.Command("ip", "route", "del", "default").Run()
		if raw, err := exec.Command("ip", "route", "add", "default", "via", cfg.Gateway, "dev", cfg.Interface).CombinedOutput(); err != nil {
			return out.String() + string(raw), err
		}
		out.WriteString("default route configured\n")
	}

	var resolv strings.Builder
	for _, dnsServer := range dns {
		dnsServer = strings.TrimSpace(dnsServer)
		if dnsServer == "" {
			continue
		}
		fmt.Fprintf(&resolv, "nameserver %s\n", dnsServer)
	}
	if err := os.WriteFile("/etc/resolv.conf", []byte(resolv.String()), 0644); err != nil {
		return out.String(), err
	}
	out.WriteString("dns configured\n")
	return out.String(), nil
}
|
||||||
|
|
||||||
|
// listInterfaceNames returns the names of candidate physical network
// interfaces from `ip -o link show`, sorted, with loopback and common
// virtual interfaces (docker, libvirt bridges, veth, tun/tap, br-*, bonds,
// dummies) filtered out.
func listInterfaceNames() ([]string, error) {
	raw, err := exec.Command("ip", "-o", "link", "show").Output()
	if err != nil {
		return nil, err
	}
	var out []string
	for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
		// Each line looks like: "2: eth0: <BROADCAST,...> mtu 1500 ..."
		fields := strings.SplitN(line, ": ", 3)
		if len(fields) < 2 {
			continue
		}
		name := fields[1]
		// ip(8) prints "name@peer" for some links (e.g. "veth123@if4");
		// keep only the interface's own name so the prefix filters below
		// and later per-interface `ip ... show <name>` calls work.
		if at := strings.IndexByte(name, '@'); at >= 0 {
			name = name[:at]
		}
		if name == "lo" || strings.HasPrefix(name, "docker") || strings.HasPrefix(name, "virbr") ||
			strings.HasPrefix(name, "veth") || strings.HasPrefix(name, "tun") ||
			strings.HasPrefix(name, "tap") || strings.HasPrefix(name, "br-") ||
			strings.HasPrefix(name, "bond") || strings.HasPrefix(name, "dummy") {
			continue
		}
		out = append(out, name)
	}
	sort.Strings(out)
	return out, nil
}
|
||||||
43
audit/internal/platform/parse.go
Normal file
43
audit/internal/platform/parse.go
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
func parseLSBLKPairs(line string) map[string]string {
|
||||||
|
out := map[string]string{}
|
||||||
|
for _, part := range splitQuotedFields(line) {
|
||||||
|
idx := strings.Index(part, "=")
|
||||||
|
if idx <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := part[:idx]
|
||||||
|
value := strings.Trim(part[idx+1:], `"`)
|
||||||
|
out[key] = value
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// splitQuotedFields splits s on spaces, except that spaces inside
// double-quoted sections do not separate fields. The quote characters
// themselves are preserved in the returned fields; empty fields
// (runs of spaces) are dropped.
func splitQuotedFields(s string) []string {
	var (
		fields []string
		buf    strings.Builder
		quoted bool
	)
	flush := func() {
		if buf.Len() > 0 {
			fields = append(fields, buf.String())
			buf.Reset()
		}
	}
	for _, ch := range s {
		switch {
		case ch == '"':
			quoted = !quoted
			buf.WriteRune(ch)
		case ch == ' ' && !quoted:
			flush()
		default:
			buf.WriteRune(ch)
		}
	}
	flush()
	return fields
}
|
||||||
214
audit/internal/platform/runtime.go
Normal file
214
audit/internal/platform/runtime.go
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// runtimeRequiredTools are the external binaries the runtime health check
// always expects on PATH, regardless of GPU vendor; vendor-specific tools
// are added by runtimeToolStatuses.
var runtimeRequiredTools = []string{
	"dmidecode",
	"lspci",
	"lsblk",
	"smartctl",
	"nvme",
	"ipmitool",
	"dhclient",
	"mount",
}
|
||||||
|
|
||||||
|
// runtimeTrackedServices are the bee service units whose state is reported
// in RuntimeHealth (queried via ServiceState).
var runtimeTrackedServices = []string{
	"bee-network",
	"bee-nvidia",
	"bee-preflight",
	"bee-audit",
	"bee-web",
	"bee-sshsetup",
}
|
||||||
|
|
||||||
|
// CollectRuntimeHealth assembles a RuntimeHealth report: export-directory
// availability, per-interface network status, required-tool presence,
// tracked-service states, and GPU driver/runtime readiness. Status starts
// as "OK", becomes "FAILED" only for a critical export-dir failure, and is
// downgraded to "PARTIAL" at the end if any issues were recorded.
// The error return is always nil in the current implementation.
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
	checkedAt := time.Now().UTC().Format(time.RFC3339)
	health := schema.RuntimeHealth{
		Status:    "OK",
		CheckedAt: checkedAt,
		ExportDir: strings.TrimSpace(exportDir),
	}

	// Export directory: a failure here is the only critical (FAILED) issue.
	if health.ExportDir != "" {
		if err := os.MkdirAll(health.ExportDir, 0755); err != nil {
			health.Status = "FAILED"
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "export_dir_unavailable",
				Severity:    "critical",
				Description: err.Error(),
			})
		}
	}

	// Network: classify each interface, then derive an overall status.
	// NOTE(review): a ListInterfaces error is silently ignored here —
	// the report simply omits network data; confirm that is intended.
	interfaces, err := s.ListInterfaces()
	if err == nil {
		health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
		hasIPv4 := false
		missingIPv4 := false
		for _, iface := range interfaces {
			outcome := "no_offer"
			if len(iface.IPv4) > 0 {
				outcome = "lease_acquired"
				hasIPv4 = true
			} else if strings.EqualFold(iface.State, "DOWN") {
				// Down links are not counted against DHCP success.
				outcome = "link_down"
			} else {
				missingIPv4 = true
			}
			health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
				Name:    iface.Name,
				State:   iface.State,
				IPv4:    iface.IPv4,
				Outcome: outcome,
			})
		}
		switch {
		case hasIPv4 && !missingIPv4:
			health.NetworkStatus = "OK"
		case hasIPv4:
			health.NetworkStatus = "PARTIAL"
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "dhcp_partial",
				Severity:    "warning",
				Description: "At least one interface did not obtain IPv4 connectivity.",
			})
		default:
			health.NetworkStatus = "FAILED"
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "dhcp_failed",
				Severity:    "warning",
				Description: "No physical interface obtained IPv4 connectivity.",
			})
		}
	}

	// Tools: required set plus vendor-specific additions.
	vendor := s.DetectGPUVendor()
	for _, tool := range s.runtimeToolStatuses(vendor) {
		health.Tools = append(health.Tools, schema.RuntimeToolStatus{
			Name: tool.Name,
			Path: tool.Path,
			OK:   tool.OK,
		})
		if !tool.OK {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "tool_missing",
				Severity:    "warning",
				Description: "Required tool missing: " + tool.Name,
			})
		}
	}

	// Service states for the tracked bee units.
	for _, name := range runtimeTrackedServices {
		health.Services = append(health.Services, schema.RuntimeServiceStatus{
			Name:   name,
			Status: s.ServiceState(name),
		})
	}

	// GPU driver/runtime readiness (may append further issues).
	s.collectGPURuntimeHealth(vendor, &health)

	// Any recorded issue downgrades OK → PARTIAL; FAILED is preserved.
	if health.Status != "FAILED" && len(health.Issues) > 0 {
		health.Status = "PARTIAL"
	}
	return health, nil
}
|
||||||
|
|
||||||
|
// commandText runs a command and returns its combined stdout+stderr as a
// string. A failing command still yields whatever partial output it
// produced; only a failure with no output at all yields "".
func commandText(name string, args ...string) string {
	output, runErr := exec.Command(name, args...).CombinedOutput()
	if len(output) == 0 && runErr != nil {
		return ""
	}
	return string(output)
}
|
||||||
|
|
||||||
|
// runtimeToolStatuses returns the presence status of all required tools:
// the vendor-neutral runtimeRequiredTools set, plus NVIDIA diagnostics
// binaries when vendor == "nvidia", or a synthesized rocm-smi entry (the
// ROCm tool may be an executable or a Python script, so it is resolved via
// resolveROCmSMICommand rather than a plain PATH lookup) when vendor == "amd".
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
	tools := s.CheckTools(runtimeRequiredTools)
	switch vendor {
	case "nvidia":
		tools = append(tools, s.CheckTools([]string{
			"nvidia-smi",
			"nvidia-bug-report.sh",
			"bee-gpu-stress",
		})...)
	case "amd":
		tool := ToolStatus{Name: "rocm-smi"}
		if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
			tool.Path = cmd[0]
			// When the resolved command is an interpreter + script pair,
			// report the script path rather than the interpreter.
			if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
				tool.Path = cmd[1]
			}
			tool.OK = true
		}
		tools = append(tools, tool)
	}
	return tools
}
|
||||||
|
|
||||||
|
// collectGPURuntimeHealth fills health.DriverReady/CUDAReady for the given
// GPU vendor and appends issues for missing kernel modules or an unready
// compute runtime. For NVIDIA it checks lsmod, nvidia-smi, and a short
// bee-gpu-stress probe; for AMD it checks lsmod and rocm-smi.
// An unknown/empty vendor leaves health untouched.
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
	lsmodText := commandText("lsmod")

	switch vendor {
	case "nvidia":
		// NOTE(review): substring match "nvidia " (trailing space) relies on
		// lsmod's column padding to avoid matching nvidia_modeset etc. —
		// fragile; confirm against actual lsmod output.
		health.DriverReady = strings.Contains(lsmodText, "nvidia ")
		if !health.DriverReady {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "nvidia_kernel_module_missing",
				Severity:    "warning",
				Description: "NVIDIA kernel module is not loaded.",
			})
		}
		if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "nvidia_modeset_failed",
				Severity:    "warning",
				Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
			})
		}
		// A working `nvidia-smi -L` overrides the lsmod heuristic.
		if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
			health.DriverReady = true
		}

		// Probe CUDA with a minimal stress run, only if the tool exists.
		if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
			out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
			if err == nil {
				health.CUDAReady = true
			} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
				health.Issues = append(health.Issues, schema.RuntimeIssue{
					Code:        "cuda_runtime_not_ready",
					Severity:    "warning",
					Description: "CUDA runtime is not ready for GPU SAT.",
				})
			}
		}
	case "amd":
		health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
		if !health.DriverReady {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "amdgpu_kernel_module_missing",
				Severity:    "warning",
				Description: "AMD GPU driver is not loaded.",
			})
		}

		// A responsive rocm-smi proves both driver and runtime readiness.
		out, err := runROCmSMI("--showproductname", "--csv")
		if err == nil && strings.TrimSpace(string(out)) != "" {
			health.CUDAReady = true
			health.DriverReady = true
			return
		}

		health.Issues = append(health.Issues, schema.RuntimeIssue{
			Code:        "rocm_smi_unavailable",
			Severity:    "warning",
			Description: "ROCm SMI is not available for AMD GPU SAT.",
		})
	}
}
|
||||||
710
audit/internal/platform/sat.go
Normal file
710
audit/internal/platform/sat.go
Normal file
@@ -0,0 +1,710 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"compress/gzip"
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
	// Indirection points over exec/os/filepath — presumably so tests can
	// stub command execution and filesystem probing; confirm usage.
	satExecCommand = exec.Command
	satLookPath    = exec.LookPath
	satGlob        = filepath.Glob
	satStat        = os.Stat

	// Candidate locations of the rocm-smi executable; versioned ROCm
	// installs live under /opt/rocm-<version>.
	rocmSMIExecutableGlobs = []string{
		"/opt/rocm/bin/rocm-smi",
		"/opt/rocm-*/bin/rocm-smi",
	}
	// Fallback locations of the Python rocm_smi implementation shipped
	// with ROCm, used when no rocm-smi binary is found.
	rocmSMIScriptGlobs = []string{
		"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
		"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
	}
)
|
||||||
|
|
||||||
|
// NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct {
	Index    int    // GPU index as reported by nvidia-smi
	Name     string // product name, e.g. "NVIDIA A100"
	MemoryMB int    // total memory in MiB; 0 if it failed to parse
}
|
||||||
|
|
||||||
|
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
type AMDGPUInfo struct {
	Index int    // sequential index assigned in parse order
	Name  string // product name column from rocm-smi CSV; may be empty
}
|
||||||
|
|
||||||
|
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd
// exists, or "" otherwise. NVIDIA wins if both device nodes are present.
// NOTE(review): uses os.Stat directly rather than the satStat indirection
// declared above — confirm whether tests need this to be stubbable.
func (s *System) DetectGPUVendor() string {
	if _, err := os.Stat("/dev/nvidia0"); err == nil {
		return "nvidia"
	}
	if _, err := os.Stat("/dev/kfd"); err == nil {
		return "amd"
	}
	return ""
}
|
||||||
|
|
||||||
|
// ListAMDGPUs returns AMD GPUs visible to rocm-smi, parsed from its CSV
// product-name output. Indices are assigned sequentially in output order.
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
	out, err := runROCmSMI("--showproductname", "--csv")
	if err != nil {
		return nil, fmt.Errorf("rocm-smi: %w", err)
	}
	var gpus []AMDGPUInfo
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		line = strings.TrimSpace(line)
		// Skip blank lines and the CSV header (which starts with "device").
		if line == "" || strings.HasPrefix(strings.ToLower(line), "device") {
			continue
		}
		// Row format: <device>,<product name ...>; keep everything after
		// the first comma as the name.
		parts := strings.SplitN(line, ",", 2)
		name := ""
		if len(parts) >= 2 {
			name = strings.TrimSpace(parts[1])
		}
		idx := len(gpus)
		gpus = append(gpus, AMDGPUInfo{Index: idx, Name: name})
	}
	return gpus, nil
}
|
||||||
|
|
||||||
|
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi plus
// board/system identification via dmidecode, and returns the path of the
// resulting tar.gz archive under baseDir.
func (s *System) RunAMDAcceptancePack(baseDir string) (string, error) {
	return runAcceptancePack(baseDir, "gpu-amd", []satJob{
		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
		{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
		{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
		{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
	})
}
|
||||||
|
|
||||||
|
// ListNvidiaGPUs returns GPUs visible to nvidia-smi, parsed from its CSV
// query output. Rows with an unparseable index are skipped; an unparseable
// memory value degrades to MemoryMB == 0.
func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
	out, err := exec.Command("nvidia-smi",
		"--query-gpu=index,name,memory.total",
		"--format=csv,noheader,nounits").Output()
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi: %w", err)
	}
	var gpus []NvidiaGPU
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		// Exactly three comma-separated columns: index, name, memory.
		parts := strings.SplitN(line, ", ", 3)
		if len(parts) != 3 {
			continue
		}
		idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
		if err != nil {
			continue
		}
		memMB, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
		gpus = append(gpus, NvidiaGPU{
			Index:    idx,
			Name:     strings.TrimSpace(parts[1]),
			MemoryMB: memMB,
		})
	}
	return gpus, nil
}
|
||||||
|
|
||||||
|
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe. Returns the
// archive path produced by runAcceptancePackCtx; ctx cancels the run.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
	// detect GPU count; on nvidia-smi failure the split of "" yields one
	// (empty) line, so gpuCount safely defaults to 1.
	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
	if gpuCount < 1 {
		gpuCount = 1
	}
	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{name: "02-all-reduce-perf.log", cmd: []string{
			// 512M → 4G message sizes, doubling each step, one process
			// driving all detected GPUs.
			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
			"-g", strconv.Itoa(gpuCount), "--iters", "20",
		}},
	})
}
|
||||||
|
|
||||||
|
// RunNvidiaAcceptancePack runs the default NVIDIA diagnostic job set
// (see nvidiaSATJobs) and returns the resulting archive path.
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
	return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
}
|
||||||
|
|
||||||
|
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress
// (out-of-range values are normalized by nvidiaDCGMJobs).
// gpuIndices: specific GPU indices to test (empty = all GPUs).
// ctx cancellation kills the running job. Returns the archive path.
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) {
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices))
}
|
||||||
|
|
||||||
|
// RunMemoryAcceptancePack runs a memtester pass bracketed by free(1)
// snapshots. Test size and pass count are tunable via the
// BEE_MEMTESTER_SIZE_MB / BEE_MEMTESTER_PASSES environment variables
// (defaults: 128 MiB, 1 pass). Returns the archive path.
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
	sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
	passes := envInt("BEE_MEMTESTER_PASSES", 1)
	return runAcceptancePack(baseDir, "memory", []satJob{
		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
		{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
	})
}
|
||||||
|
|
||||||
|
// RunCPUAcceptancePack runs a stress-ng CPU burn (all methods, all cores)
// for durationSec seconds (default 60 when non-positive), bracketed by
// sensors(1) snapshots to observe the thermal response. Returns the
// archive path.
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
	if durationSec <= 0 {
		durationSec = 60
	}
	return runAcceptancePack(baseDir, "cpu", []satJob{
		{name: "01-lscpu.log", cmd: []string{"lscpu"}},
		{name: "02-sensors-before.log", cmd: []string{"sensors"}},
		// --cpu 0 means "one worker per online CPU" in stress-ng.
		{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
		{name: "04-sensors-after.log", cmd: []string{"sensors"}},
	})
}
|
||||||
|
|
||||||
|
// RunStorageAcceptancePack runs the per-device storage SAT command set over
// every discovered storage device, writing one log per command into a
// timestamped run directory plus a summary.txt, and returns the path of the
// resulting tar.gz archive under baseDir (default /var/log/bee-sat).
// Unlike runAcceptancePack, commands here are generated per device.
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "storage-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	devices, err := listStorageDevices()
	if err != nil {
		return "", err
	}
	sort.Strings(devices)

	var summary strings.Builder
	stats := satStats{}
	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
	if len(devices) == 0 {
		// No devices is recorded as a single "unsupported" result so the
		// summary totals are never all-zero.
		fmt.Fprintln(&summary, "devices=0")
		stats.Unsupported++
	} else {
		fmt.Fprintf(&summary, "devices=%d\n", len(devices))
	}

	for index, devPath := range devices {
		// Log names are "<NN>-<dev>-<MM>-<cmd>.log" so they sort by device
		// then by command order.
		prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
		commands := storageSATCommands(devPath)
		for cmdIndex, job := range commands {
			name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
			out, err := runSATCommand(verboseLog, job.name, job.cmd)
			if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
				return "", writeErr
			}
			status, rc := classifySATResult(job.name, out, err)
			stats.Add(status)
			// Summary keys: <dev>_<job_name_with_underscores>_rc / _status.
			key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
			fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
			fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
		}
	}

	writeSATStats(&summary, stats)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
		return "", err
	}
	archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
	if err := createTarGz(archive, runDir); err != nil {
		return "", err
	}
	return archive, nil
}
|
||||||
|
|
||||||
|
// satJob is one diagnostic command in an acceptance pack: its output is
// written to a log file named `name` inside the run directory.
type satJob struct {
	name       string   // log file name, e.g. "01-nvidia-smi-q.log"
	cmd        []string // argv; "{{run_dir}}" in args is substituted at run time
	env        []string // extra env vars (appended to os.Environ)
	collectGPU bool     // collect GPU metrics via nvidia-smi while this job runs
	gpuIndices []int    // GPU indices to collect metrics for (empty = all)
}
|
||||||
|
|
||||||
|
// satStats tallies per-job outcomes for a SAT run summary
// (see classifySATResult for how statuses are assigned).
type satStats struct {
	OK          int // jobs that completed successfully
	Failed      int // jobs that ran but reported failure
	Unsupported int // jobs that could not run on this hardware/tooling
}
|
||||||
|
|
||||||
|
// nvidiaSATJobs returns the default NVIDIA acceptance job set: inventory
// (nvidia-smi, dmidecode), a full nvidia-bug-report, and a short
// bee-gpu-stress burn whose duration/size come from the
// BEE_GPU_STRESS_SECONDS / BEE_GPU_STRESS_SIZE_MB environment variables
// (defaults: 5 s, 64 MiB).
func nvidiaSATJobs() []satJob {
	seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
	sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
	return []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
		{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
		// {{run_dir}} is expanded by runAcceptancePack before execution.
		{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
		{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
	}
}
|
||||||
|
|
||||||
|
// runAcceptancePack runs every SAT job sequentially, capturing each job's
// combined output to <runDir>/<job.name>, classifying the result into a
// summary.txt, and finally archiving the run directory to
// baseDir/<prefix>-<ts>.tar.gz. Returns the archive path.
// baseDir defaults to /var/log/bee-sat.
//
// NOTE(review): job.env, job.collectGPU, and job.gpuIndices are ignored by
// this variant; use runAcceptancePackCtx for env/GPU-metric support.
func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, prefix+"-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	var summary strings.Builder
	stats := satStats{}
	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
	for _, job := range jobs {
		// Expand the {{run_dir}} placeholder in every argument.
		cmd := make([]string, 0, len(job.cmd))
		for _, arg := range job.cmd {
			cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
		}
		out, err := runSATCommand(verboseLog, job.name, cmd)
		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
			return "", writeErr
		}
		status, rc := classifySATResult(job.name, out, err)
		stats.Add(status)
		// Summary key: drop a single leading "0" and the ".log" suffix,
		// e.g. "01-nvidia-smi-q.log" -> "1-nvidia-smi-q".
		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
		fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
		fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
	}
	writeSATStats(&summary, stats)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
		return "", err
	}

	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
	if err := createTarGz(archive, runDir); err != nil {
		return "", err
	}
	return archive, nil
}
|
||||||
|
|
||||||
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
|
if diagLevel < 1 || diagLevel > 4 {
|
||||||
|
diagLevel = 3
|
||||||
|
}
|
||||||
|
diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
ids := make([]string, len(gpuIndices))
|
||||||
|
for i, idx := range gpuIndices {
|
||||||
|
ids[i] = strconv.Itoa(idx)
|
||||||
|
}
|
||||||
|
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||||
|
}
|
||||||
|
return []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
|
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
|
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// runAcceptancePackCtx is the context-aware variant of runAcceptancePack:
// it stops scheduling new jobs once ctx is cancelled (the summary and the
// archive of whatever already ran are still produced), honors per-job env
// vars, and optionally samples GPU metrics while a job runs (job.collectGPU).
// Returns the path of the tar.gz archive of the run directory.
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) {
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, prefix+"-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	var summary strings.Builder
	stats := satStats{}
	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
	for _, job := range jobs {
		// Cancelled: skip the remaining jobs but still finalize the run.
		if ctx.Err() != nil {
			break
		}
		// Expand the {{run_dir}} placeholder in every argument.
		cmd := make([]string, 0, len(job.cmd))
		for _, arg := range job.cmd {
			cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
		}

		var out []byte
		var err error

		if job.collectGPU {
			out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir)
		} else {
			out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env)
		}

		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
			return "", writeErr
		}
		status, rc := classifySATResult(job.name, out, err)
		stats.Add(status)
		// Summary key: drop a single leading "0" and the ".log" suffix,
		// e.g. "01-nvidia-smi-q.log" -> "1-nvidia-smi-q".
		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
		fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
		fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
	}
	writeSATStats(&summary, stats)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
		return "", err
	}

	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
	if err := createTarGz(archive, runDir); err != nil {
		return "", err
	}
	return archive, nil
}
|
||||||
|
|
||||||
|
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
||||||
|
start := time.Now().UTC()
|
||||||
|
resolvedCmd, err := resolveSATCommand(cmd)
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||||
|
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||||
|
"rc: 1",
|
||||||
|
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
return []byte(err.Error() + "\n"), err
|
||||||
|
}
|
||||||
|
|
||||||
|
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||||
|
if len(env) > 0 {
|
||||||
|
c.Env = append(os.Environ(), env...)
|
||||||
|
}
|
||||||
|
out, err := c.CombinedOutput()
|
||||||
|
|
||||||
|
rc := 0
|
||||||
|
if err != nil {
|
||||||
|
rc = 1
|
||||||
|
}
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||||
|
fmt.Sprintf("rc: %d", rc),
|
||||||
|
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func listStorageDevices() ([]string, error) {
|
||||||
|
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return parseStorageDevices(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func storageSATCommands(devPath string) []satJob {
|
||||||
|
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||||
|
return []satJob{
|
||||||
|
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||||
|
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||||
|
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return []satJob{
|
||||||
|
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||||
|
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *satStats) Add(status string) {
|
||||||
|
switch status {
|
||||||
|
case "OK":
|
||||||
|
s.OK++
|
||||||
|
case "UNSUPPORTED":
|
||||||
|
s.Unsupported++
|
||||||
|
default:
|
||||||
|
s.Failed++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s satStats) Overall() string {
|
||||||
|
if s.Failed > 0 {
|
||||||
|
return "FAILED"
|
||||||
|
}
|
||||||
|
if s.Unsupported > 0 {
|
||||||
|
return "PARTIAL"
|
||||||
|
}
|
||||||
|
return "OK"
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSATStats(summary *strings.Builder, stats satStats) {
|
||||||
|
fmt.Fprintf(summary, "overall_status=%s\n", stats.Overall())
|
||||||
|
fmt.Fprintf(summary, "job_ok=%d\n", stats.OK)
|
||||||
|
fmt.Fprintf(summary, "job_failed=%d\n", stats.Failed)
|
||||||
|
fmt.Fprintf(summary, "job_unsupported=%d\n", stats.Unsupported)
|
||||||
|
}
|
||||||
|
|
||||||
|
func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||||
|
rc := 0
|
||||||
|
if err != nil {
|
||||||
|
rc = 1
|
||||||
|
}
|
||||||
|
if err == nil {
|
||||||
|
return "OK", rc
|
||||||
|
}
|
||||||
|
|
||||||
|
text := strings.ToLower(string(out))
|
||||||
|
if strings.Contains(text, "unsupported") ||
|
||||||
|
strings.Contains(text, "not supported") ||
|
||||||
|
strings.Contains(text, "invalid opcode") ||
|
||||||
|
strings.Contains(text, "unknown command") ||
|
||||||
|
strings.Contains(text, "not implemented") ||
|
||||||
|
strings.Contains(text, "not available") ||
|
||||||
|
strings.Contains(text, "cuda_error_system_not_ready") ||
|
||||||
|
strings.Contains(text, "no such device") ||
|
||||||
|
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||||
|
return "UNSUPPORTED", rc
|
||||||
|
}
|
||||||
|
return "FAILED", rc
|
||||||
|
}
|
||||||
|
|
||||||
|
// runSATCommand executes one SAT command, mirroring runSATCommandCtx but
// without cancellation or extra-env support, and going through the
// satExecCommand indirection (presumably a test seam — confirm) instead of
// exec.CommandContext. Start/finish, the resolved command line, a coarse
// return code (0/1), and the duration are appended to verboseLog; combined
// stdout+stderr is returned. When command resolution fails, the error text
// itself is returned as output so it still lands in the job's log file.
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
	start := time.Now().UTC()
	resolvedCmd, err := resolveSATCommand(cmd)
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
		"cmd: "+strings.Join(resolvedCmd, " "),
	)
	if err != nil {
		appendSATVerboseLog(verboseLog,
			fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
			"rc: 1",
			fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
			"",
		)
		return []byte(err.Error() + "\n"), err
	}

	out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()

	// rc is a coarse pass/fail indicator, not the process exit code.
	rc := 0
	if err != nil {
		rc = 1
	}
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
		fmt.Sprintf("rc: %d", rc),
		fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
		"",
	)
	return out, err
}
|
||||||
|
|
||||||
|
func runROCmSMI(args ...string) ([]byte, error) {
|
||||||
|
cmd, err := resolveROCmSMICommand(args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveSATCommand(cmd []string) ([]string, error) {
|
||||||
|
if len(cmd) == 0 {
|
||||||
|
return nil, errors.New("empty SAT command")
|
||||||
|
}
|
||||||
|
if cmd[0] != "rocm-smi" {
|
||||||
|
return cmd, nil
|
||||||
|
}
|
||||||
|
return resolveROCmSMICommand(cmd[1:]...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||||
|
if path, err := satLookPath("rocm-smi"); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, path := range rocmSMIExecutableCandidates() {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
pythonPath, pyErr := satLookPath("python3")
|
||||||
|
if pyErr == nil {
|
||||||
|
for _, script := range rocmSMIScriptCandidates() {
|
||||||
|
cmd := []string{pythonPath, script}
|
||||||
|
cmd = append(cmd, args...)
|
||||||
|
return cmd, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||||
|
}
|
||||||
|
|
||||||
|
// rocmSMIExecutableCandidates lists existing rocm-smi executables matched by
// the package-level glob patterns, sorted and de-duplicated.
func rocmSMIExecutableCandidates() []string {
	return expandExistingPaths(rocmSMIExecutableGlobs)
}
|
||||||
|
|
||||||
|
// rocmSMIScriptCandidates lists existing rocm_smi python scripts matched by
// the package-level glob patterns, sorted and de-duplicated.
func rocmSMIScriptCandidates() []string {
	return expandExistingPaths(rocmSMIScriptGlobs)
}
|
||||||
|
|
||||||
|
func expandExistingPaths(patterns []string) []string {
|
||||||
|
seen := make(map[string]struct{})
|
||||||
|
var paths []string
|
||||||
|
for _, pattern := range patterns {
|
||||||
|
matches, err := satGlob(pattern)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sort.Strings(matches)
|
||||||
|
for _, match := range matches {
|
||||||
|
if _, err := satStat(match); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := seen[match]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[match] = struct{}{}
|
||||||
|
paths = append(paths, match)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return paths
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseStorageDevices extracts device paths from "lsblk -dn -o NAME,TYPE,TRAN"
// output, keeping only TYPE=="disk" rows and excluding USB-attached devices.
func parseStorageDevices(raw string) []string {
	var devices []string
	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
		// strings.Fields already discards surrounding whitespace.
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		if fields[1] != "disk" {
			continue
		}
		if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
			continue
		}
		devices = append(devices, "/dev/"+fields[0])
	}
	return devices
}
|
||||||
|
|
||||||
|
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
// Metric file writes are best-effort (errors ignored) and skipped entirely
// when no samples were collected. The sampler goroutine is stopped and
// joined (stopCh then doneCh) before metricRows is read, so the slice is
// accessed race-free.
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
	stopCh := make(chan struct{})
	doneCh := make(chan struct{})
	var metricRows []GPUMetricRow
	start := time.Now()

	// Sample GPU metrics once per second until told to stop.
	go func() {
		defer close(doneCh)
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				return
			case <-ticker.C:
				samples, err := sampleGPUMetrics(gpuIndices)
				if err != nil {
					// Sampling is best-effort; skip failed ticks.
					continue
				}
				elapsed := time.Since(start).Seconds()
				for i := range samples {
					samples[i].ElapsedSec = elapsed
				}
				metricRows = append(metricRows, samples...)
			}
		}
	}()

	out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env)

	// Stop the sampler and wait for it to exit before reading metricRows.
	close(stopCh)
	<-doneCh

	if len(metricRows) > 0 {
		_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
		_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
		chart := RenderGPUTerminalChart(metricRows)
		_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
	}

	return out, err
}
|
||||||
|
|
||||||
|
func appendSATVerboseLog(path string, lines ...string) {
|
||||||
|
if path == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
for _, line := range lines {
|
||||||
|
_, _ = io.WriteString(f, line+"\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func envInt(name string, fallback int) int {
|
||||||
|
raw := strings.TrimSpace(os.Getenv(name))
|
||||||
|
if raw == "" {
|
||||||
|
return fallback
|
||||||
|
}
|
||||||
|
value, err := strconv.Atoi(raw)
|
||||||
|
if err != nil || value <= 0 {
|
||||||
|
return fallback
|
||||||
|
}
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
func createTarGz(dst, srcDir string) error {
|
||||||
|
file, err := os.Create(dst)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
gz := gzip.NewWriter(file)
|
||||||
|
defer gz.Close()
|
||||||
|
|
||||||
|
tw := tar.NewWriter(gz)
|
||||||
|
defer tw.Close()
|
||||||
|
|
||||||
|
base := filepath.Dir(srcDir)
|
||||||
|
return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
header, err := tar.FileInfoHeader(info, "")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(base, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
header.Name = rel
|
||||||
|
if err := tw.WriteHeader(header); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
file, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
_, err = io.Copy(tw, file)
|
||||||
|
return err
|
||||||
|
})
|
||||||
|
}
|
||||||
587
audit/internal/platform/sat_fan_stress.go
Normal file
587
audit/internal/platform/sat_fan_stress.go
Normal file
@@ -0,0 +1,587 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// FanStressOptions configures the fan-stress / thermal cycling test.
// Zero or negative values are replaced with the documented defaults by
// applyFanStressDefaults before the test runs.
type FanStressOptions struct {
	BaselineSec  int   // idle monitoring before and after load (default 30)
	Phase1DurSec int   // first load phase duration in seconds (default 300)
	PauseSec     int   // pause between the two load phases (default 60)
	Phase2DurSec int   // second load phase duration in seconds (default 300)
	SizeMB       int   // GPU memory to allocate per GPU during stress (default 64)
	GPUIndices   []int // which GPU indices to stress (empty = all detected)
}
|
||||||
|
|
||||||
|
// FanReading holds one fan sensor reading as parsed from "ipmitool sdr".
type FanReading struct {
	Name string  // sensor name, e.g. "FAN1"
	RPM  float64 // measured speed in revolutions per minute
}
|
||||||
|
|
||||||
|
// GPUStressMetric holds per-GPU metrics during the stress test, sampled
// from nvidia-smi.
type GPUStressMetric struct {
	Index    int     // GPU index as reported by nvidia-smi
	TempC    float64 // core temperature in degrees Celsius
	UsagePct float64 // GPU utilization percentage
	PowerW   float64 // power draw in watts
	ClockMHz float64 // current graphics clock in MHz
	Throttled bool   // true if any throttle reason is active
}
|
||||||
|
|
||||||
|
// FanStressRow is one second-interval telemetry sample covering all
// monitored dimensions: per-GPU metrics, chassis fan speeds, CPU
// temperature, and system power draw.
type FanStressRow struct {
	TimestampUTC string  // sample time in RFC3339 UTC
	ElapsedSec   float64 // seconds since the test started
	Phase        string  // "baseline", "load1", "pause", "load2", "cooldown"
	GPUs         []GPUStressMetric
	Fans         []FanReading
	CPUMaxTempC  float64 // highest CPU temperature from ipmitool / sensors
	SysPowerW    float64 // DCMI system power reading
}
|
||||||
|
|
||||||
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
//
// The run directory is created under baseDir (default /var/log/bee-sat); on
// success the directory is archived to baseDir/fan-stress-<ts>.tar.gz and the
// archive path is returned. Cancelling ctx skips remaining phases but still
// produces the summary, CSVs, and archive for what already ran.
func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) {
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}
	applyFanStressDefaults(&opts)

	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "fan-stress-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	// Phase name shared between sampler goroutine and main goroutine.
	var phaseMu sync.Mutex
	currentPhase := "init"
	setPhase := func(name string) {
		phaseMu.Lock()
		currentPhase = name
		phaseMu.Unlock()
	}
	getPhase := func() string {
		phaseMu.Lock()
		defer phaseMu.Unlock()
		return currentPhase
	}

	start := time.Now()
	var rowsMu sync.Mutex
	var allRows []FanStressRow

	// Start background sampler (every second).
	stopCh := make(chan struct{})
	doneCh := make(chan struct{})
	go func() {
		defer close(doneCh)
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				return
			case <-ticker.C:
				row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds())
				rowsMu.Lock()
				allRows = append(allRows, row)
				rowsMu.Unlock()
			}
		}
	}()

	var summary strings.Builder
	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))

	stats := satStats{}

	// idlePhase sleeps for durSec while the sampler stamps phaseName on each row.
	idlePhase := func(phaseName, stepName string, durSec int) {
		if ctx.Err() != nil {
			return
		}
		setPhase(phaseName)
		appendSATVerboseLog(verboseLog,
			fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec),
		)
		select {
		case <-ctx.Done():
		case <-time.After(time.Duration(durSec) * time.Second):
		}
		appendSATVerboseLog(verboseLog,
			fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName),
		)
		fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
		stats.OK++
	}

	// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
	loadPhase := func(phaseName, stepName string, durSec int) {
		if ctx.Err() != nil {
			return
		}
		setPhase(phaseName)
		var env []string
		if len(opts.GPUIndices) > 0 {
			ids := make([]string, len(opts.GPUIndices))
			for i, idx := range opts.GPUIndices {
				ids[i] = strconv.Itoa(idx)
			}
			env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
		}
		cmd := []string{
			"bee-gpu-stress",
			"--seconds", strconv.Itoa(durSec),
			"--size-mb", strconv.Itoa(opts.SizeMB),
		}
		out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
		_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
		// NOTE(review): comparing err.Error() to "signal: killed" is fragile —
		// errors.Is / ctx.Err() would be more robust. Confirm which error
		// runSATCommandCtx actually yields on context cancellation.
		if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
			fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
			stats.Failed++
		} else {
			fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
			stats.OK++
		}
	}

	// Execute test phases.
	idlePhase("baseline", "01-baseline", opts.BaselineSec)
	loadPhase("load1", "02-load1", opts.Phase1DurSec)
	idlePhase("pause", "03-pause", opts.PauseSec)
	loadPhase("load2", "04-load2", opts.Phase2DurSec)
	idlePhase("cooldown", "05-cooldown", opts.BaselineSec)

	// Stop sampler and collect rows.
	close(stopCh)
	<-doneCh

	rowsMu.Lock()
	rows := allRows
	rowsMu.Unlock()

	// Analysis.
	throttled := analyzeThrottling(rows)
	maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
		var m float64
		for _, g := range r.GPUs {
			if g.TempC > m {
				m = g.TempC
			}
		}
		return m
	})
	maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
		return r.CPUMaxTempC
	})
	fanResponseSec := analyzeFanResponse(rows)

	fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled)
	fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp)
	fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp)
	if fanResponseSec >= 0 {
		fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec)
	} else {
		fmt.Fprintf(&summary, "fan_response_sec=N/A\n")
	}

	// Throttling failure counts against overall result.
	if throttled {
		stats.Failed++
	}
	writeSATStats(&summary, stats)

	// Write CSV outputs.
	if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil {
		return "", err
	}
	_ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows)

	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
		return "", err
	}

	archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
	if err := createTarGz(archive, runDir); err != nil {
		return "", err
	}
	return archive, nil
}
|
||||||
|
|
||||||
|
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||||
|
if opts.BaselineSec <= 0 {
|
||||||
|
opts.BaselineSec = 30
|
||||||
|
}
|
||||||
|
if opts.Phase1DurSec <= 0 {
|
||||||
|
opts.Phase1DurSec = 300
|
||||||
|
}
|
||||||
|
if opts.PauseSec <= 0 {
|
||||||
|
opts.PauseSec = 60
|
||||||
|
}
|
||||||
|
if opts.Phase2DurSec <= 0 {
|
||||||
|
opts.Phase2DurSec = 300
|
||||||
|
}
|
||||||
|
if opts.SizeMB <= 0 {
|
||||||
|
opts.SizeMB = 64
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleFanStressRow collects all metrics for one telemetry sample.
// Collection is best-effort: fan read errors are discarded, and the other
// samplers return zero values on failure, so a row is always produced.
func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow {
	row := FanStressRow{
		TimestampUTC: time.Now().UTC().Format(time.RFC3339),
		ElapsedSec:   elapsed,
		Phase:        phase,
	}
	row.GPUs = sampleGPUStressMetrics(gpuIndices)
	row.Fans, _ = sampleFanSpeeds() // best-effort: nil when ipmitool fails
	row.CPUMaxTempC = sampleCPUMaxTemp()
	row.SysPowerW = sampleSystemPower()
	return row
}
|
||||||
|
|
||||||
|
// sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power,
|
||||||
|
// clock frequency, and active throttle reasons for each GPU.
|
||||||
|
func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
||||||
|
args := []string{
|
||||||
|
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
ids := make([]string, len(gpuIndices))
|
||||||
|
for i, idx := range gpuIndices {
|
||||||
|
ids[i] = strconv.Itoa(idx)
|
||||||
|
}
|
||||||
|
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||||
|
}
|
||||||
|
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var metrics []GPUStressMetric
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ", ")
|
||||||
|
if len(parts) < 6 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
throttleVal := strings.TrimSpace(parts[5])
|
||||||
|
// Throttled if active reasons bitmask is non-zero.
|
||||||
|
throttled := throttleVal != "0x0000000000000000" &&
|
||||||
|
throttleVal != "0x0" &&
|
||||||
|
throttleVal != "0" &&
|
||||||
|
throttleVal != "" &&
|
||||||
|
throttleVal != "N/A"
|
||||||
|
metrics = append(metrics, GPUStressMetric{
|
||||||
|
Index: idx,
|
||||||
|
TempC: parseGPUFloat(parts[1]),
|
||||||
|
UsagePct: parseGPUFloat(parts[2]),
|
||||||
|
PowerW: parseGPUFloat(parts[3]),
|
||||||
|
ClockMHz: parseGPUFloat(parts[4]),
|
||||||
|
Throttled: throttled,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||||
|
func sampleFanSpeeds() ([]FanReading, error) {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return parseFanSpeeds(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
|
// Line format: "FAN1 | 2400.000 | RPM | ok"
|
||||||
|
func parseFanSpeeds(raw string) []FanReading {
|
||||||
|
var fans []FanReading
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
|
parts := strings.Split(line, "|")
|
||||||
|
if len(parts) < 3 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
unit := strings.TrimSpace(parts[2])
|
||||||
|
if !strings.EqualFold(unit, "RPM") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
valStr := strings.TrimSpace(parts[1])
|
||||||
|
if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
val, err := strconv.ParseFloat(valStr, 64)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fans = append(fans, FanReading{
|
||||||
|
Name: strings.TrimSpace(parts[0]),
|
||||||
|
RPM: val,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return fans
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||||
|
func sampleCPUMaxTemp() float64 {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||||
|
if err != nil {
|
||||||
|
return sampleCPUTempViaSensors()
|
||||||
|
}
|
||||||
|
return parseIPMIMaxTemp(string(out))
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseIPMIMaxTemp extracts the maximum temperature from
// "ipmitool sdr type Temperature" output. Rows whose unit column does not
// mention "degrees" or whose value is missing ("na"/empty) are ignored;
// returns 0 when no valid reading is present.
func parseIPMIMaxTemp(raw string) float64 {
	highest := 0.0
	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
		cols := strings.Split(line, "|")
		if len(cols) < 3 {
			continue
		}
		if !strings.Contains(strings.ToLower(strings.TrimSpace(cols[2])), "degrees") {
			continue
		}
		reading := strings.TrimSpace(cols[1])
		if reading == "" || strings.EqualFold(reading, "na") {
			continue
		}
		if v, err := strconv.ParseFloat(reading, 64); err == nil && v > highest {
			highest = v
		}
	}
	return highest
}
|
||||||
|
|
||||||
|
// sampleCPUTempViaSensors falls back to lm-sensors ("sensors -u") when
// ipmitool is unavailable. Only *_input readings inside the plausible CPU
// range (0, 150) are considered; returns 0 when nothing usable is found.
func sampleCPUTempViaSensors() float64 {
	raw, err := exec.Command("sensors", "-u").Output()
	if err != nil {
		return 0
	}
	highest := 0.0
	for _, line := range strings.Split(string(raw), "\n") {
		tokens := strings.Fields(strings.TrimSpace(line))
		if len(tokens) < 2 || !strings.HasSuffix(tokens[0], "_input:") {
			continue
		}
		v, convErr := strconv.ParseFloat(tokens[1], 64)
		if convErr != nil {
			continue
		}
		if v > 0 && v < 150 && v > highest {
			highest = v
		}
	}
	return highest
}
|
||||||
|
|
||||||
|
// sampleSystemPower reads system power draw via DCMI.
|
||||||
|
func sampleSystemPower() float64 {
|
||||||
|
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||||
|
if err != nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return parseDCMIPowerReading(string(out))
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
// Sample: "    Instantaneous power reading:                   500 Watts"
// Returns 0 when no such line (or no parseable number before "Watts") is found.
func parseDCMIPowerReading(raw string) float64 {
	for _, entry := range strings.Split(raw, "\n") {
		if !strings.Contains(strings.ToLower(entry), "instantaneous") {
			continue
		}
		tokens := strings.Fields(entry)
		// The wattage is the token immediately preceding "Watts".
		for i := 1; i < len(tokens); i++ {
			if !strings.EqualFold(tokens[i], "Watts") {
				continue
			}
			if v, err := strconv.ParseFloat(tokens[i-1], 64); err == nil {
				return v
			}
		}
	}
	return 0
}
|
||||||
|
|
||||||
|
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||||
|
// during either load phase.
|
||||||
|
func analyzeThrottling(rows []FanStressRow) bool {
|
||||||
|
for _, row := range rows {
|
||||||
|
if row.Phase != "load1" && row.Phase != "load2" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, gpu := range row.GPUs {
|
||||||
|
if gpu.Throttled {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// analyzeMaxTemp returns the maximum value of the given extractor across all rows.
|
||||||
|
func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 {
|
||||||
|
var max float64
|
||||||
|
for _, row := range rows {
|
||||||
|
if v := extract(row); v > max {
|
||||||
|
max = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return max
|
||||||
|
}
|
||||||
|
|
||||||
|
// analyzeFanResponse returns the seconds from load1 start until fan RPM first
|
||||||
|
// increased by more than 5% above the baseline average. Returns -1 if undetermined.
|
||||||
|
func analyzeFanResponse(rows []FanStressRow) float64 {
|
||||||
|
// Compute baseline average fan RPM.
|
||||||
|
var baseTotal, baseCount float64
|
||||||
|
for _, row := range rows {
|
||||||
|
if row.Phase != "baseline" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, f := range row.Fans {
|
||||||
|
baseTotal += f.RPM
|
||||||
|
baseCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if baseCount == 0 || baseTotal == 0 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
baseAvg := baseTotal / baseCount
|
||||||
|
threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up
|
||||||
|
|
||||||
|
// Find elapsed time when load1 started.
|
||||||
|
var load1Start float64 = -1
|
||||||
|
for _, row := range rows {
|
||||||
|
if row.Phase == "load1" {
|
||||||
|
load1Start = row.ElapsedSec
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if load1Start < 0 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find first load1 row where average RPM crosses the threshold.
|
||||||
|
for _, row := range rows {
|
||||||
|
if row.Phase != "load1" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var total, count float64
|
||||||
|
for _, f := range row.Fans {
|
||||||
|
total += f.RPM
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
if count > 0 && total/count >= threshold {
|
||||||
|
return row.ElapsedSec - load1Start
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
// WriteFanStressCSV writes the wide-format metrics CSV with one row per second.
// GPU columns are generated per index in gpuIndices order.
//
// Column layout: 8 fixed system columns, then 5 columns per GPU index.
// GPUs missing from a row emit zero-valued columns (see gpuByIdx below).
func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error {
	if len(rows) == 0 {
		// Still create the file so downstream tooling finds something at path.
		return os.WriteFile(path, []byte("no data\n"), 0644)
	}

	var b strings.Builder

	// Header: fixed system columns + per-GPU columns.
	b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w")
	for _, idx := range gpuIndices {
		fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled",
			idx, idx, idx, idx, idx)
	}
	b.WriteRune('\n')

	for _, row := range rows {
		favg, fmin, fmax := fanRPMStats(row.Fans)
		fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f",
			row.TimestampUTC,
			row.ElapsedSec,
			row.Phase,
			favg, fmin, fmax,
			row.CPUMaxTempC,
			row.SysPowerW,
		)
		// Index this row's GPU samples so columns can be emitted in
		// gpuIndices order regardless of sample order.
		gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs))
		for _, g := range row.GPUs {
			gpuByIdx[g.Index] = g
		}
		for _, idx := range gpuIndices {
			g := gpuByIdx[idx] // zero value when this GPU has no sample in the row
			throttled := 0
			if g.Throttled {
				throttled = 1
			}
			fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d",
				g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled)
		}
		b.WriteRune('\n')
	}

	return os.WriteFile(path, []byte(b.String()), 0644)
}
|
||||||
|
|
||||||
|
// WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format.
|
||||||
|
func WriteFanSensorsCSV(path string, rows []FanStressRow) error {
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n")
|
||||||
|
for _, row := range rows {
|
||||||
|
for _, f := range row.Fans {
|
||||||
|
fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n",
|
||||||
|
row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// fanRPMStats computes average, min, max RPM across all fans in a sample row.
|
||||||
|
func fanRPMStats(fans []FanReading) (avg, min, max float64) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, 0, 0
|
||||||
|
}
|
||||||
|
min = fans[0].RPM
|
||||||
|
max = fans[0].RPM
|
||||||
|
var total float64
|
||||||
|
for _, f := range fans {
|
||||||
|
total += f.RPM
|
||||||
|
if f.RPM < min {
|
||||||
|
min = f.RPM
|
||||||
|
}
|
||||||
|
if f.RPM > max {
|
||||||
|
max = f.RPM
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return total / float64(len(fans)), min, max
|
||||||
|
}
|
||||||
182
audit/internal/platform/sat_test.go
Normal file
182
audit/internal/platform/sat_test.go
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
package platform

import (
	"errors"
	"os"
	"os/exec"
	"path/filepath"
	"testing"
)

// TestStorageSATCommands pins the per-device command plans: NVMe disks get
// three commands (third is `nvme`), SATA disks get two (smartctl first).
func TestStorageSATCommands(t *testing.T) {
	t.Parallel()

	nvme := storageSATCommands("/dev/nvme0n1")
	if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
		t.Fatalf("unexpected nvme commands: %#v", nvme)
	}

	sata := storageSATCommands("/dev/sda")
	if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
		t.Fatalf("unexpected sata commands: %#v", sata)
	}
}

// TestRunNvidiaAcceptancePackIncludesGPUStress pins the NVIDIA job list layout:
// five jobs, the last being bee-gpu-stress, with the bug-report job (index 3)
// using the --output-file flag.
func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
	t.Parallel()

	jobs := nvidiaSATJobs()

	if len(jobs) != 5 {
		t.Fatalf("jobs=%d want 5", len(jobs))
	}
	if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
		t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
	}
	if got := jobs[3].cmd[1]; got != "--output-file" {
		t.Fatalf("bug report flag=%q want --output-file", got)
	}
}

// TestNvidiaSATJobsUseEnvOverrides verifies BEE_GPU_STRESS_* environment
// variables flow into the generated bee-gpu-stress command line.
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
	t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
	t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")

	jobs := nvidiaSATJobs()
	got := jobs[4].cmd
	want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
	if len(got) != len(want) {
		t.Fatalf("cmd len=%d want %d", len(got), len(want))
	}
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("cmd[%d]=%q want %q", i, got[i], want[i])
		}
	}
}

// TestEnvIntFallback covers envInt's three cases: unset, unparsable, and valid.
func TestEnvIntFallback(t *testing.T) {
	os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
	if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
		t.Fatalf("got %d want 123", got)
	}
	t.Setenv("BEE_MEMTESTER_SIZE_MB", "bad")
	if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
		t.Fatalf("got %d want 123", got)
	}
	t.Setenv("BEE_MEMTESTER_SIZE_MB", "256")
	if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 256 {
		t.Fatalf("got %d want 256", got)
	}
}

// TestClassifySATResult table-tests job status classification, including the
// UNSUPPORTED mappings for self-test-incapable drives and a CUDA stack that
// reports CUDA_ERROR_SYSTEM_NOT_READY.
func TestClassifySATResult(t *testing.T) {
	tests := []struct {
		name   string
		job    string
		out    string
		err    error
		status string
	}{
		{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
		{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
		{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
		{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, _ := classifySATResult(tt.job, []byte(tt.out), tt.err)
			if got != tt.status {
				t.Fatalf("status=%q want %q", got, tt.status)
			}
		})
	}
}

// TestParseStorageDevicesSkipsUSBDisks ensures USB-attached disks and loop
// devices are excluded from the SAT target list.
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
	t.Parallel()

	raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
	got := parseStorageDevices(raw)
	want := []string{"/dev/nvme0n1", "/dev/sdb"}
	if len(got) != len(want) {
		t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
	}
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
		}
	}
}

// TestResolveROCmSMICommandFromPATH plants a fake rocm-smi on a scratch PATH
// and expects resolution to return its absolute path plus the one argument.
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
	t.Setenv("PATH", t.TempDir())

	toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
	if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
		t.Fatalf("write rocm-smi: %v", err)
	}

	cmd, err := resolveROCmSMICommand("--showproductname")
	if err != nil {
		t.Fatalf("resolveROCmSMICommand error: %v", err)
	}
	if len(cmd) != 2 {
		t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
	}
	if cmd[0] != toolPath {
		t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
	}
}

// TestResolveROCmSMICommandFallsBackToROCmTree empties PATH and points the
// package-level glob lists at a fake ROCm install tree, restoring them on cleanup.
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
	tmp := t.TempDir()
	execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
	if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
		t.Fatalf("mkdir: %v", err)
	}
	if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
		t.Fatalf("write rocm-smi: %v", err)
	}

	oldGlob := rocmSMIExecutableGlobs
	oldScriptGlobs := rocmSMIScriptGlobs
	rocmSMIExecutableGlobs = []string{execPath}
	rocmSMIScriptGlobs = nil
	t.Cleanup(func() {
		rocmSMIExecutableGlobs = oldGlob
		rocmSMIScriptGlobs = oldScriptGlobs
	})

	t.Setenv("PATH", "")

	cmd, err := resolveROCmSMICommand("--showallinfo")
	if err != nil {
		t.Fatalf("resolveROCmSMICommand error: %v", err)
	}
	if len(cmd) != 2 {
		t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
	}
	if cmd[0] != execPath {
		t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
	}
}

// TestRunROCmSMIReportsMissingCommand stubs out PATH lookup and both glob
// lists so no rocm-smi can be found, and expects runROCmSMI to surface an error.
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
	oldLookPath := satLookPath
	oldExecGlobs := rocmSMIExecutableGlobs
	oldScriptGlobs := rocmSMIScriptGlobs
	satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
	rocmSMIExecutableGlobs = nil
	rocmSMIScriptGlobs = nil
	t.Cleanup(func() {
		satLookPath = oldLookPath
		rocmSMIExecutableGlobs = oldExecGlobs
		rocmSMIScriptGlobs = oldScriptGlobs
	})

	if _, err := runROCmSMI("--showproductname"); err == nil {
		t.Fatal("expected missing rocm-smi error")
	}
}
|
||||||
54
audit/internal/platform/services.go
Normal file
54
audit/internal/platform/services.go
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *System) ListBeeServices() ([]string, error) {
|
||||||
|
seen := map[string]bool{}
|
||||||
|
var out []string
|
||||||
|
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
|
||||||
|
matches, err := filepath.Glob(pattern)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
for _, match := range matches {
|
||||||
|
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
||||||
|
if !seen[name] {
|
||||||
|
seen[name] = true
|
||||||
|
out = append(out, name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Strings(out)
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) ServiceState(name string) string {
|
||||||
|
raw, err := exec.Command("systemctl", "is-active", name).CombinedOutput()
|
||||||
|
if err == nil {
|
||||||
|
return strings.TrimSpace(string(raw))
|
||||||
|
}
|
||||||
|
raw, err = exec.Command("systemctl", "show", name, "--property=ActiveState", "--value").CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
state := strings.TrimSpace(string(raw))
|
||||||
|
if state == "" {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
return state
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
|
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) ServiceStatus(name string) (string, error) {
|
||||||
|
raw, err := exec.Command("systemctl", "status", name, "--no-pager").CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
49
audit/internal/platform/system_test.go
Normal file
49
audit/internal/platform/system_test.go
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
package platform

import "testing"

// TestSplitQuotedFields verifies that quoted values containing spaces
// (e.g. LABEL="BEE EXPORT") are kept together as single fields.
func TestSplitQuotedFields(t *testing.T) {
	t.Parallel()

	line := `NAME="sdb1" TYPE="part" LABEL="BEE EXPORT" MODEL="USB DISK 3.0"`
	got := splitQuotedFields(line)
	want := []string{
		`NAME="sdb1"`,
		`TYPE="part"`,
		`LABEL="BEE EXPORT"`,
		`MODEL="USB DISK 3.0"`,
	}

	if len(got) != len(want) {
		t.Fatalf("len(got)=%d len(want)=%d; got=%q", len(got), len(want), got)
	}
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("got[%d]=%q want %q", i, got[i], want[i])
		}
	}
}

// TestParseLSBLKPairs verifies KEY="value" pairs from lsblk pair-format output
// are parsed into a map, including empty values (MOUNTPOINT="").
func TestParseLSBLKPairs(t *testing.T) {
	t.Parallel()

	line := `NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" FSTYPE="vfat" MOUNTPOINT="" SIZE="57.3G" LABEL="BEE EXPORT" MODEL="USB DISK 3.0"`
	got := parseLSBLKPairs(line)

	checks := map[string]string{
		"NAME":       "sdb1",
		"TYPE":       "part",
		"PKNAME":     "sdb",
		"RM":         "1",
		"FSTYPE":     "vfat",
		"MOUNTPOINT": "",
		"SIZE":       "57.3G",
		"LABEL":      "BEE EXPORT",
		"MODEL":      "USB DISK 3.0",
	}
	for key, want := range checks {
		if got[key] != want {
			t.Fatalf("got[%s]=%q want %q", key, got[key], want)
		}
	}
}
|
||||||
150
audit/internal/platform/techdump.go
Normal file
150
audit/internal/platform/techdump.go
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// techDumpFixedCommands lists the vendor-neutral diagnostics captured on every
// run. Each entry names the binary, its arguments, and the output filename
// under the dump directory.
var techDumpFixedCommands = []struct {
	Name string
	Args []string
	File string
}{
	{Name: "dmidecode", Args: []string{"-t", "0"}, File: "dmidecode-type0.txt"},
	{Name: "dmidecode", Args: []string{"-t", "1"}, File: "dmidecode-type1.txt"},
	{Name: "dmidecode", Args: []string{"-t", "2"}, File: "dmidecode-type2.txt"},
	{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
	{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
	{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
	{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
	{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
	{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
	{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
	{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
}

// techDumpNvidiaCommands lists additional captures performed only when an
// NVIDIA GPU is detected.
var techDumpNvidiaCommands = []struct {
	Name string
	Args []string
	File string
}{
	{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
	{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
}

// lsblkDumpRoot mirrors the subset of `lsblk -J` JSON needed to select
// whole disks by transport.
type lsblkDumpRoot struct {
	Blockdevices []struct {
		Name string `json:"name"`
		Type string `json:"type"`
		Tran string `json:"tran"`
	} `json:"blockdevices"`
}

// nvmeDumpRoot mirrors the subset of `nvme list -o json` output used to
// enumerate NVMe device paths.
type nvmeDumpRoot struct {
	Devices []struct {
		DevicePath string `json:"DevicePath"`
	} `json:"Devices"`
}
|
||||||
|
|
||||||
|
// CaptureTechnicalDump collects hardware and vendor diagnostics into baseDir.
// Each individual capture is best-effort: a failing tool simply leaves its
// output file absent (see writeCommandDump). Only directory creation errors
// are returned.
func (s *System) CaptureTechnicalDump(baseDir string) error {
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return err
	}

	for _, cmd := range techDumpFixedCommands {
		writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
	}
	switch s.DetectGPUVendor() {
	case "nvidia":
		for _, cmd := range techDumpNvidiaCommands {
			writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
		}
	case "amd":
		writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
		writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
	}

	// NOTE: the per-device captures below re-read lsblk.json / nvme-list.json
	// written by the fixed-command loop above, so ordering matters.
	for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
		writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
	}
	for _, dev := range nvmeDumpDevices(filepath.Join(baseDir, "nvme-list.json")) {
		writeCommandDump(filepath.Join(baseDir, "nvme-id-ctrl-"+sanitizeDumpName(dev)+".json"), "nvme", "id-ctrl", dev, "-o", "json")
		writeCommandDump(filepath.Join(baseDir, "nvme-smart-log-"+sanitizeDumpName(dev)+".json"), "nvme", "smart-log", dev, "-o", "json")
	}
	return nil
}
|
||||||
|
|
||||||
|
// writeCommandDump runs name with args and saves the combined output to path.
// Output is kept even when the command fails (partial diagnostics are still
// useful); nothing is written only when the command fails AND emits nothing.
// Write errors are deliberately ignored — dumps are best-effort.
func writeCommandDump(path, name string, args ...string) {
	output, runErr := exec.Command(name, args...).CombinedOutput()
	if runErr != nil && len(output) == 0 {
		return
	}
	_ = os.WriteFile(path, output, 0644)
}
|
||||||
|
|
||||||
|
func writeROCmSMIDump(path string, args ...string) {
|
||||||
|
out, err := runROCmSMI(args...)
|
||||||
|
if err != nil && len(out) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(path, out, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func lsblkDumpDevices(path string) []string {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var root lsblkDumpRoot
|
||||||
|
if err := json.Unmarshal(raw, &root); err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var devices []string
|
||||||
|
for _, dev := range root.Blockdevices {
|
||||||
|
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
|
||||||
|
devices = append(devices, strings.TrimSpace(dev.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Strings(devices)
|
||||||
|
return devices
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvmeDumpDevices(path string) []string {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var root nvmeDumpRoot
|
||||||
|
if err := json.Unmarshal(raw, &root); err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
seen := map[string]bool{}
|
||||||
|
var devices []string
|
||||||
|
for _, dev := range root.Devices {
|
||||||
|
name := strings.TrimSpace(dev.DevicePath)
|
||||||
|
if name == "" || seen[name] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[name] = true
|
||||||
|
devices = append(devices, name)
|
||||||
|
}
|
||||||
|
sort.Strings(devices)
|
||||||
|
return devices
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanitizeDumpName converts a device path into a filename-safe token:
// whitespace and a "/dev/" prefix are stripped, remaining slashes become
// underscores, and an empty result maps to "unknown".
func sanitizeDumpName(value string) string {
	cleaned := strings.TrimPrefix(strings.TrimSpace(value), "/dev/")
	cleaned = strings.ReplaceAll(cleaned, "/", "_")
	if cleaned == "" {
		return "unknown"
	}
	return cleaned
}
|
||||||
48
audit/internal/platform/techdump_test.go
Normal file
48
audit/internal/platform/techdump_test.go
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
package platform

import (
	"os"
	"path/filepath"
	"reflect"
	"testing"
)

// TestLSBLKDumpDevices feeds a fixture lsblk.json and expects only non-USB
// whole disks back, sorted (partitions and USB disks excluded).
func TestLSBLKDumpDevices(t *testing.T) {
	t.Parallel()

	dir := t.TempDir()
	path := filepath.Join(dir, "lsblk.json")
	if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
		t.Fatalf("write lsblk fixture: %v", err)
	}

	got := lsblkDumpDevices(path)
	want := []string{"nvme0n1", "sdb"}
	if !reflect.DeepEqual(got, want) {
		t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
	}
}

// TestNVMEDumpDevices feeds a fixture nvme-list.json containing a duplicate
// entry and expects de-duplicated, sorted device paths.
func TestNVMEDumpDevices(t *testing.T) {
	t.Parallel()

	dir := t.TempDir()
	path := filepath.Join(dir, "nvme-list.json")
	if err := os.WriteFile(path, []byte(`{"Devices":[{"DevicePath":"/dev/nvme1n1"},{"DevicePath":"/dev/nvme0n1"},{"DevicePath":"/dev/nvme1n1"}]}`), 0644); err != nil {
		t.Fatalf("write nvme fixture: %v", err)
	}

	got := nvmeDumpDevices(path)
	want := []string{"/dev/nvme0n1", "/dev/nvme1n1"}
	if !reflect.DeepEqual(got, want) {
		t.Fatalf("nvmeDumpDevices=%v want %v", got, want)
	}
}

// TestSanitizeDumpName checks the /dev/ prefix is stripped from device paths.
func TestSanitizeDumpName(t *testing.T) {
	t.Parallel()

	if got := sanitizeDumpName("/dev/nvme0n1"); got != "nvme0n1" {
		t.Fatalf("sanitizeDumpName=%q want nvme0n1", got)
	}
}
|
||||||
29
audit/internal/platform/tools.go
Normal file
29
audit/internal/platform/tools.go
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *System) TailFile(path string, lines int) string {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Sprintf("read %s: %v", path, err)
|
||||||
|
}
|
||||||
|
all := strings.Split(strings.TrimRight(string(raw), "\n"), "\n")
|
||||||
|
if lines <= 0 || len(all) <= lines {
|
||||||
|
return string(raw)
|
||||||
|
}
|
||||||
|
return strings.Join(all[len(all)-lines:], "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) CheckTools(names []string) []ToolStatus {
|
||||||
|
out := make([]ToolStatus, 0, len(names))
|
||||||
|
for _, name := range names {
|
||||||
|
path, err := exec.LookPath(name)
|
||||||
|
out = append(out, ToolStatus{Name: name, Path: path, OK: err == nil})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
44
audit/internal/platform/types.go
Normal file
44
audit/internal/platform/types.go
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
package platform

// System is the receiver for all host-level platform operations in this package.
type System struct{}

// InterfaceInfo describes one network interface: its name, link state, and
// assigned IPv4 addresses.
type InterfaceInfo struct {
	Name  string
	State string
	IPv4  []string
}

// ServiceAction is a systemctl verb applied to a unit via ServiceDo.
type ServiceAction string

const (
	ServiceStart   ServiceAction = "start"
	ServiceStop    ServiceAction = "stop"
	ServiceRestart ServiceAction = "restart"
)

// StaticIPv4Config carries the parameters for a static IPv4 assignment on an
// interface.
type StaticIPv4Config struct {
	Interface string
	Address   string
	Prefix    string
	Gateway   string
	DNS       []string
}

// RemovableTarget describes a removable block device (e.g. a USB stick)
// that can serve as an export destination.
type RemovableTarget struct {
	Device     string
	FSType     string
	Size       string
	Label      string
	Model      string
	Mountpoint string
}

// ToolStatus reports whether a named external tool resolves on PATH, and where.
type ToolStatus struct {
	Name string
	Path string
	OK   bool
}

// New returns a ready-to-use System.
func New() *System {
	return &System{}
}
|
||||||
77
audit/internal/runtimeenv/runtimeenv.go
Normal file
77
audit/internal/runtimeenv/runtimeenv.go
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
package runtimeenv
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Mode identifies how the audit binary is running.
type Mode string

const (
	ModeAuto   Mode = "auto"   // resolve via env var, markers, or default
	ModeLocal  Mode = "local"  // installed on the target host
	ModeLiveCD Mode = "livecd" // booted from the live image
)

// Info is the outcome of runtime-mode resolution.
type Info struct {
	Mode     Mode
	Detected bool   // true when detection (marker/kernel/default) decided, not flag/env
	Reason   string // provenance of the decision, e.g. "flag" or "marker:/etc/bee-release"
}
|
||||||
|
|
||||||
|
func ParseMode(raw string) (Mode, error) {
|
||||||
|
mode := Mode(strings.TrimSpace(strings.ToLower(raw)))
|
||||||
|
switch mode {
|
||||||
|
case "", ModeAuto:
|
||||||
|
return ModeAuto, nil
|
||||||
|
case ModeLocal, ModeLiveCD:
|
||||||
|
return mode, nil
|
||||||
|
default:
|
||||||
|
return "", fmt.Errorf("invalid runtime %q — use auto, local, or livecd", raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Detect(flagValue string) (Info, error) {
|
||||||
|
flagMode, err := ParseMode(flagValue)
|
||||||
|
if err != nil {
|
||||||
|
return Info{}, err
|
||||||
|
}
|
||||||
|
if flagMode != ModeAuto {
|
||||||
|
return Info{Mode: flagMode, Reason: "flag"}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if envMode, ok := getenvMode("BEE_RUNTIME"); ok {
|
||||||
|
return Info{Mode: envMode, Reason: "env:BEE_RUNTIME"}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if fileExists("/etc/bee-release") {
|
||||||
|
return Info{Mode: ModeLiveCD, Detected: true, Reason: "marker:/etc/bee-release"}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if data, err := os.ReadFile("/proc/cmdline"); err == nil {
|
||||||
|
cmdline := string(data)
|
||||||
|
if strings.Contains(cmdline, " boot=live") || strings.HasPrefix(cmdline, "boot=live ") || strings.Contains(cmdline, "live-media") {
|
||||||
|
return Info{Mode: ModeLiveCD, Detected: true, Reason: "kernel:boot=live"}, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Info{Mode: ModeLocal, Detected: true, Reason: "default:local"}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getenvMode(name string) (Mode, bool) {
|
||||||
|
value := strings.TrimSpace(os.Getenv(name))
|
||||||
|
if value == "" {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
mode, err := ParseMode(value)
|
||||||
|
if err != nil || mode == ModeAuto {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
return mode, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// fileExists reports whether path names an existing non-directory entry.
func fileExists(path string) bool {
	info, err := os.Stat(path)
	if err != nil {
		return false
	}
	return !info.IsDir()
}
|
||||||
67
audit/internal/runtimeenv/runtimeenv_test.go
Normal file
67
audit/internal/runtimeenv/runtimeenv_test.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package runtimeenv

import (
	"os"
	"testing"
)

// TestParseMode table-tests normalization: empty input maps to auto, the
// known modes round-trip, and anything else errors.
func TestParseMode(t *testing.T) {
	t.Parallel()

	tests := []struct {
		in   string
		want Mode
		ok   bool
	}{
		{in: "", want: ModeAuto, ok: true},
		{in: "auto", want: ModeAuto, ok: true},
		{in: "local", want: ModeLocal, ok: true},
		{in: "livecd", want: ModeLiveCD, ok: true},
		{in: "bad", ok: false},
	}

	for _, test := range tests {
		got, err := ParseMode(test.in)
		if test.ok && err != nil {
			t.Fatalf("ParseMode(%q): %v", test.in, err)
		}
		if !test.ok && err == nil {
			t.Fatalf("ParseMode(%q): expected error", test.in)
		}
		if test.ok && got != test.want {
			t.Fatalf("ParseMode(%q): got %q want %q", test.in, got, test.want)
		}
	}
}

// TestDetectHonorsFlag checks an explicit flag short-circuits detection and
// is reported with Reason "flag".
func TestDetectHonorsFlag(t *testing.T) {
	t.Parallel()

	info, err := Detect("livecd")
	if err != nil {
		t.Fatalf("Detect(flag): %v", err)
	}
	if info.Mode != ModeLiveCD || info.Reason != "flag" {
		t.Fatalf("unexpected info: %+v", info)
	}
}

// TestDetectHonorsEnv checks BEE_RUNTIME is consulted when the flag is auto.
// NOTE(review): uses os.Setenv with manual cleanup rather than t.Setenv,
// which would conflict with t.Parallel().
func TestDetectHonorsEnv(t *testing.T) {
	t.Parallel()

	old := os.Getenv("BEE_RUNTIME")
	t.Cleanup(func() {
		_ = os.Setenv("BEE_RUNTIME", old)
	})
	if err := os.Setenv("BEE_RUNTIME", "local"); err != nil {
		t.Fatalf("Setenv: %v", err)
	}

	info, err := Detect("auto")
	if err != nil {
		t.Fatalf("Detect(env): %v", err)
	}
	if info.Mode != ModeLocal || info.Reason != "env:BEE_RUNTIME" {
		t.Fatalf("unexpected info: %+v", info)
	}
}
|
||||||
@@ -2,17 +2,55 @@
|
|||||||
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
||||||
package schema
|
package schema
|
||||||
|
|
||||||
// HardwareIngestRequest is the top-level output document produced by the audit binary.
|
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
||||||
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
||||||
type HardwareIngestRequest struct {
|
type HardwareIngestRequest struct {
|
||||||
Filename *string `json:"filename"`
|
Filename *string `json:"filename,omitempty"`
|
||||||
SourceType *string `json:"source_type"`
|
SourceType *string `json:"source_type,omitempty"`
|
||||||
Protocol *string `json:"protocol"`
|
Protocol *string `json:"protocol,omitempty"`
|
||||||
TargetHost string `json:"target_host"`
|
TargetHost *string `json:"target_host,omitempty"`
|
||||||
CollectedAt string `json:"collected_at"`
|
CollectedAt string `json:"collected_at"`
|
||||||
|
Runtime *RuntimeHealth `json:"runtime,omitempty"`
|
||||||
Hardware HardwareSnapshot `json:"hardware"`
|
Hardware HardwareSnapshot `json:"hardware"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type RuntimeHealth struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
CheckedAt string `json:"checked_at"`
|
||||||
|
ExportDir string `json:"export_dir,omitempty"`
|
||||||
|
DriverReady bool `json:"driver_ready,omitempty"`
|
||||||
|
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||||
|
NetworkStatus string `json:"network_status,omitempty"`
|
||||||
|
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||||
|
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||||
|
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||||
|
Interfaces []RuntimeInterface `json:"interfaces,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type RuntimeIssue struct {
|
||||||
|
Code string `json:"code"`
|
||||||
|
Severity string `json:"severity,omitempty"`
|
||||||
|
Description string `json:"description"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type RuntimeToolStatus struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Path string `json:"path,omitempty"`
|
||||||
|
OK bool `json:"ok"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type RuntimeServiceStatus struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type RuntimeInterface struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
State string `json:"state,omitempty"`
|
||||||
|
IPv4 []string `json:"ipv4,omitempty"`
|
||||||
|
Outcome string `json:"outcome,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type HardwareSnapshot struct {
|
type HardwareSnapshot struct {
|
||||||
Board HardwareBoard `json:"board"`
|
Board HardwareBoard `json:"board"`
|
||||||
Firmware []HardwareFirmwareRecord `json:"firmware,omitempty"`
|
Firmware []HardwareFirmwareRecord `json:"firmware,omitempty"`
|
||||||
@@ -21,14 +59,33 @@ type HardwareSnapshot struct {
|
|||||||
Storage []HardwareStorage `json:"storage,omitempty"`
|
Storage []HardwareStorage `json:"storage,omitempty"`
|
||||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||||
|
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||||
|
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareHealthSummary struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
|
Failures []string `json:"failures,omitempty"`
|
||||||
|
StorageWarn int `json:"storage_warn,omitempty"`
|
||||||
|
StorageFail int `json:"storage_fail,omitempty"`
|
||||||
|
PCIeWarn int `json:"pcie_warn,omitempty"`
|
||||||
|
PCIeFail int `json:"pcie_fail,omitempty"`
|
||||||
|
PSUWarn int `json:"psu_warn,omitempty"`
|
||||||
|
PSUFail int `json:"psu_fail,omitempty"`
|
||||||
|
MemoryWarn int `json:"memory_warn,omitempty"`
|
||||||
|
MemoryFail int `json:"memory_fail,omitempty"`
|
||||||
|
EmptyDIMMs int `json:"empty_dimms,omitempty"`
|
||||||
|
MissingPSUs int `json:"missing_psus,omitempty"`
|
||||||
|
CollectedAt string `json:"collected_at,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareBoard struct {
|
type HardwareBoard struct {
|
||||||
Manufacturer *string `json:"manufacturer"`
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
ProductName *string `json:"product_name"`
|
ProductName *string `json:"product_name,omitempty"`
|
||||||
SerialNumber string `json:"serial_number"`
|
SerialNumber string `json:"serial_number"`
|
||||||
PartNumber *string `json:"part_number"`
|
PartNumber *string `json:"part_number,omitempty"`
|
||||||
UUID *string `json:"uuid"`
|
UUID *string `json:"uuid,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareFirmwareRecord struct {
|
type HardwareFirmwareRecord struct {
|
||||||
@@ -37,77 +94,196 @@ type HardwareFirmwareRecord struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type HardwareCPU struct {
|
type HardwareCPU struct {
|
||||||
Socket *int `json:"socket"`
|
HardwareComponentStatus
|
||||||
Model *string `json:"model"`
|
Socket *int `json:"socket,omitempty"`
|
||||||
Manufacturer *string `json:"manufacturer"`
|
Model *string `json:"model,omitempty"`
|
||||||
Status *string `json:"status"`
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
Firmware *string `json:"firmware"`
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
Cores *int `json:"cores"`
|
Cores *int `json:"cores,omitempty"`
|
||||||
Threads *int `json:"threads"`
|
Threads *int `json:"threads,omitempty"`
|
||||||
FrequencyMHz *int `json:"frequency_mhz"`
|
FrequencyMHz *int `json:"frequency_mhz,omitempty"`
|
||||||
MaxFrequencyMHz *int `json:"max_frequency_mhz"`
|
MaxFrequencyMHz *int `json:"max_frequency_mhz,omitempty"`
|
||||||
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
|
PowerW *float64 `json:"power_w,omitempty"`
|
||||||
|
Throttled *bool `json:"throttled,omitempty"`
|
||||||
|
CorrectableErrorCount *int64 `json:"correctable_error_count,omitempty"`
|
||||||
|
UncorrectableErrorCount *int64 `json:"uncorrectable_error_count,omitempty"`
|
||||||
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
|
Present *bool `json:"present,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareMemory struct {
|
type HardwareMemory struct {
|
||||||
Slot *string `json:"slot"`
|
HardwareComponentStatus
|
||||||
Location *string `json:"location"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
Present *bool `json:"present"`
|
Location *string `json:"location,omitempty"`
|
||||||
SizeMB *int `json:"size_mb"`
|
Present *bool `json:"present,omitempty"`
|
||||||
Type *string `json:"type"`
|
SizeMB *int `json:"size_mb,omitempty"`
|
||||||
MaxSpeedMHz *int `json:"max_speed_mhz"`
|
Type *string `json:"type,omitempty"`
|
||||||
CurrentSpeedMHz *int `json:"current_speed_mhz"`
|
MaxSpeedMHz *int `json:"max_speed_mhz,omitempty"`
|
||||||
Manufacturer *string `json:"manufacturer"`
|
CurrentSpeedMHz *int `json:"current_speed_mhz,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
PartNumber *string `json:"part_number"`
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
Status *string `json:"status"`
|
PartNumber *string `json:"part_number,omitempty"`
|
||||||
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
|
CorrectableECCErrorCount *int64 `json:"correctable_ecc_error_count,omitempty"`
|
||||||
|
UncorrectableECCErrorCount *int64 `json:"uncorrectable_ecc_error_count,omitempty"`
|
||||||
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
|
SpareBlocksRemainingPct *float64 `json:"spare_blocks_remaining_pct,omitempty"`
|
||||||
|
PerformanceDegraded *bool `json:"performance_degraded,omitempty"`
|
||||||
|
DataLossDetected *bool `json:"data_loss_detected,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareStorage struct {
|
type HardwareStorage struct {
|
||||||
Slot *string `json:"slot"`
|
HardwareComponentStatus
|
||||||
Type *string `json:"type"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
Model *string `json:"model"`
|
Type *string `json:"type,omitempty"`
|
||||||
SizeGB *int `json:"size_gb"`
|
Model *string `json:"model,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
SizeGB *int `json:"size_gb,omitempty"`
|
||||||
Manufacturer *string `json:"manufacturer"`
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
Firmware *string `json:"firmware"`
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
Interface *string `json:"interface"`
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
Present *bool `json:"present"`
|
Interface *string `json:"interface,omitempty"`
|
||||||
Status *string `json:"status"`
|
Present *bool `json:"present,omitempty"`
|
||||||
Telemetry map[string]any `json:"telemetry,omitempty"`
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
|
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||||
|
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||||
|
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||||
|
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||||
|
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||||
|
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||||
|
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||||
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
|
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||||
|
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||||
|
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||||
|
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||||
|
Telemetry map[string]any `json:"-"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwarePCIeDevice struct {
|
type HardwarePCIeDevice struct {
|
||||||
Slot *string `json:"slot"`
|
HardwareComponentStatus
|
||||||
VendorID *int `json:"vendor_id"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
DeviceID *int `json:"device_id"`
|
VendorID *int `json:"vendor_id,omitempty"`
|
||||||
BDF *string `json:"bdf"`
|
DeviceID *int `json:"device_id,omitempty"`
|
||||||
DeviceClass *string `json:"device_class"`
|
NUMANode *int `json:"numa_node,omitempty"`
|
||||||
Manufacturer *string `json:"manufacturer"`
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
Model *string `json:"model"`
|
PowerW *float64 `json:"power_w,omitempty"`
|
||||||
LinkWidth *int `json:"link_width"`
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
LinkSpeed *string `json:"link_speed"`
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
MaxLinkWidth *int `json:"max_link_width"`
|
ECCCorrectedTotal *int64 `json:"ecc_corrected_total,omitempty"`
|
||||||
MaxLinkSpeed *string `json:"max_link_speed"`
|
ECCUncorrectedTotal *int64 `json:"ecc_uncorrected_total,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
HWSlowdown *bool `json:"hw_slowdown,omitempty"`
|
||||||
Firmware *string `json:"firmware"`
|
BatteryChargePct *float64 `json:"battery_charge_pct,omitempty"`
|
||||||
Present *bool `json:"present"`
|
BatteryHealthPct *float64 `json:"battery_health_pct,omitempty"`
|
||||||
Status *string `json:"status"`
|
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||||
Telemetry map[string]any `json:"telemetry,omitempty"`
|
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||||
|
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||||
|
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||||
|
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||||
|
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||||
|
SFPVoltageV *float64 `json:"sfp_voltage_v,omitempty"`
|
||||||
|
SFPBiasMA *float64 `json:"sfp_bias_ma,omitempty"`
|
||||||
|
BDF *string `json:"-"`
|
||||||
|
DeviceClass *string `json:"device_class,omitempty"`
|
||||||
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
|
Model *string `json:"model,omitempty"`
|
||||||
|
LinkWidth *int `json:"link_width,omitempty"`
|
||||||
|
LinkSpeed *string `json:"link_speed,omitempty"`
|
||||||
|
MaxLinkWidth *int `json:"max_link_width,omitempty"`
|
||||||
|
MaxLinkSpeed *string `json:"max_link_speed,omitempty"`
|
||||||
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
|
MacAddresses []string `json:"mac_addresses,omitempty"`
|
||||||
|
Present *bool `json:"present,omitempty"`
|
||||||
|
Telemetry map[string]any `json:"-"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwarePowerSupply struct {
|
type HardwarePowerSupply struct {
|
||||||
Slot *string `json:"slot"`
|
HardwareComponentStatus
|
||||||
Present *bool `json:"present"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
Model *string `json:"model"`
|
Present *bool `json:"present,omitempty"`
|
||||||
Vendor *string `json:"vendor"`
|
Model *string `json:"model,omitempty"`
|
||||||
WattageW *int `json:"wattage_w"`
|
Vendor *string `json:"vendor,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
WattageW *int `json:"wattage_w,omitempty"`
|
||||||
PartNumber *string `json:"part_number"`
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
Firmware *string `json:"firmware"`
|
PartNumber *string `json:"part_number,omitempty"`
|
||||||
Status *string `json:"status"`
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
InputType *string `json:"input_type"`
|
InputType *string `json:"input_type,omitempty"`
|
||||||
InputPowerW *float64 `json:"input_power_w"`
|
InputPowerW *float64 `json:"input_power_w,omitempty"`
|
||||||
OutputPowerW *float64 `json:"output_power_w"`
|
OutputPowerW *float64 `json:"output_power_w,omitempty"`
|
||||||
InputVoltage *float64 `json:"input_voltage"`
|
InputVoltage *float64 `json:"input_voltage,omitempty"`
|
||||||
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareComponentStatus struct {
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
|
StatusCheckedAt *string `json:"status_checked_at,omitempty"`
|
||||||
|
StatusChangedAt *string `json:"status_changed_at,omitempty"`
|
||||||
|
StatusHistory []HardwareStatusHistory `json:"status_history,omitempty"`
|
||||||
|
ErrorDescription *string `json:"error_description,omitempty"`
|
||||||
|
ManufacturedYearWeek *string `json:"manufactured_year_week,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareStatusHistory struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
ChangedAt string `json:"changed_at"`
|
||||||
|
Details *string `json:"details,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareSensors struct {
|
||||||
|
Fans []HardwareFanSensor `json:"fans,omitempty"`
|
||||||
|
Power []HardwarePowerSensor `json:"power,omitempty"`
|
||||||
|
Temperatures []HardwareTemperatureSensor `json:"temperatures,omitempty"`
|
||||||
|
Other []HardwareOtherSensor `json:"other,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareFanSensor struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Location *string `json:"location,omitempty"`
|
||||||
|
RPM *int `json:"rpm,omitempty"`
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwarePowerSensor struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Location *string `json:"location,omitempty"`
|
||||||
|
VoltageV *float64 `json:"voltage_v,omitempty"`
|
||||||
|
CurrentA *float64 `json:"current_a,omitempty"`
|
||||||
|
PowerW *float64 `json:"power_w,omitempty"`
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareTemperatureSensor struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Location *string `json:"location,omitempty"`
|
||||||
|
Celsius *float64 `json:"celsius,omitempty"`
|
||||||
|
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
||||||
|
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareOtherSensor struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Location *string `json:"location,omitempty"`
|
||||||
|
Value *float64 `json:"value,omitempty"`
|
||||||
|
Unit *string `json:"unit,omitempty"`
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareEventLog struct {
|
||||||
|
Source string `json:"source"`
|
||||||
|
EventTime *string `json:"event_time,omitempty"`
|
||||||
|
Severity *string `json:"severity,omitempty"`
|
||||||
|
MessageID *string `json:"message_id,omitempty"`
|
||||||
|
Message string `json:"message"`
|
||||||
|
ComponentRef *string `json:"component_ref,omitempty"`
|
||||||
|
Fingerprint *string `json:"fingerprint,omitempty"`
|
||||||
|
IsActive *bool `json:"is_active,omitempty"`
|
||||||
|
RawPayload map[string]any `json:"raw_payload,omitempty"`
|
||||||
}
|
}
|
||||||
|
|||||||
46
audit/internal/schema/hardware_test.go
Normal file
46
audit/internal/schema/hardware_test.go
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
package schema
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
|
||||||
|
week := "2024-W07"
|
||||||
|
eventTime := "2026-03-15T14:03:11Z"
|
||||||
|
message := "Correctable ECC error threshold exceeded"
|
||||||
|
|
||||||
|
payload := HardwareIngestRequest{
|
||||||
|
CollectedAt: "2026-03-15T15:00:00Z",
|
||||||
|
Hardware: HardwareSnapshot{
|
||||||
|
Board: HardwareBoard{SerialNumber: "SRV-001"},
|
||||||
|
CPUs: []HardwareCPU{
|
||||||
|
{
|
||||||
|
HardwareComponentStatus: HardwareComponentStatus{
|
||||||
|
ManufacturedYearWeek: &week,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
EventLogs: []HardwareEventLog{
|
||||||
|
{
|
||||||
|
Source: "bmc",
|
||||||
|
EventTime: &eventTime,
|
||||||
|
Message: message,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := json.Marshal(payload)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("marshal: %v", err)
|
||||||
|
}
|
||||||
|
text := string(data)
|
||||||
|
if !strings.Contains(text, `"manufactured_year_week":"2024-W07"`) {
|
||||||
|
t.Fatalf("missing manufactured_year_week: %s", text)
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, `"event_logs":[{"source":"bmc","event_time":"2026-03-15T14:03:11Z","message":"Correctable ECC error threshold exceeded"}]`) {
|
||||||
|
t.Fatalf("missing event_logs payload: %s", text)
|
||||||
|
}
|
||||||
|
}
|
||||||
445
audit/internal/webui/api.go
Normal file
445
audit/internal/webui/api.go
Normal file
@@ -0,0 +1,445 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
var jobCounter atomic.Uint64
|
||||||
|
|
||||||
|
func newJobID(prefix string) string {
|
||||||
|
return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func sseWrite(w http.ResponseWriter, event, data string) bool {
|
||||||
|
f, ok := w.(http.Flusher)
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if event != "" {
|
||||||
|
fmt.Fprintf(w, "event: %s\n", event)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(w, "data: %s\n\n", data)
|
||||||
|
f.Flush()
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func sseStart(w http.ResponseWriter) bool {
|
||||||
|
_, ok := w.(http.Flusher)
|
||||||
|
if !ok {
|
||||||
|
http.Error(w, "streaming not supported", http.StatusInternalServerError)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "text/event-stream")
|
||||||
|
w.Header().Set("Cache-Control", "no-cache")
|
||||||
|
w.Header().Set("Connection", "keep-alive")
|
||||||
|
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// streamJob streams lines from a jobState to a SSE response.
|
||||||
|
func streamJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
||||||
|
if !sseStart(w) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
existing, ch := j.subscribe()
|
||||||
|
for _, line := range existing {
|
||||||
|
sseWrite(w, "", line)
|
||||||
|
}
|
||||||
|
if ch == nil {
|
||||||
|
// Job already finished
|
||||||
|
sseWrite(w, "done", j.err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case line, ok := <-ch:
|
||||||
|
if !ok {
|
||||||
|
sseWrite(w, "done", j.err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
sseWrite(w, "", line)
|
||||||
|
case <-r.Context().Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// runCmdJob runs an exec.Cmd as a background job, streaming stdout+stderr lines.
|
||||||
|
func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
||||||
|
pr, pw := io.Pipe()
|
||||||
|
cmd.Stdout = pw
|
||||||
|
cmd.Stderr = pw
|
||||||
|
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
j.finish(err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
scanner := bufio.NewScanner(pr)
|
||||||
|
for scanner.Scan() {
|
||||||
|
j.append(scanner.Text())
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
err := cmd.Wait()
|
||||||
|
_ = pw.Close()
|
||||||
|
if err != nil {
|
||||||
|
j.finish(err.Error())
|
||||||
|
} else {
|
||||||
|
j.finish("")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Audit ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAPIAuditRun(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
id := newJobID("audit")
|
||||||
|
j := globalJobs.create(id)
|
||||||
|
go func() {
|
||||||
|
j.append("Running audit...")
|
||||||
|
result, err := h.opts.App.RunAuditNow(h.opts.RuntimeMode)
|
||||||
|
if err != nil {
|
||||||
|
j.append("ERROR: " + err.Error())
|
||||||
|
j.finish(err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, line := range strings.Split(result.Body, "\n") {
|
||||||
|
if line != "" {
|
||||||
|
j.append(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
j.finish("")
|
||||||
|
}()
|
||||||
|
writeJSON(w, map[string]string{"job_id": id})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIAuditStream(w http.ResponseWriter, r *http.Request) {
|
||||||
|
id := r.URL.Query().Get("job_id")
|
||||||
|
j, ok := globalJobs.get(id)
|
||||||
|
if !ok {
|
||||||
|
http.Error(w, "job not found", http.StatusNotFound)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
streamJob(w, r, j)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── SAT ───────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||||
|
return func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
id := newJobID("sat-" + target)
|
||||||
|
j := globalJobs.create(id)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
j.append(fmt.Sprintf("Starting %s acceptance test...", target))
|
||||||
|
var (
|
||||||
|
archive string
|
||||||
|
err error
|
||||||
|
)
|
||||||
|
|
||||||
|
// Parse optional parameters
|
||||||
|
var body struct {
|
||||||
|
Duration int `json:"duration"`
|
||||||
|
DiagLevel int `json:"diag_level"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
}
|
||||||
|
body.DiagLevel = 1
|
||||||
|
if r.ContentLength > 0 {
|
||||||
|
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||||
|
}
|
||||||
|
|
||||||
|
switch target {
|
||||||
|
case "nvidia":
|
||||||
|
if len(body.GPUIndices) > 0 || body.DiagLevel > 0 {
|
||||||
|
result, e := h.opts.App.RunNvidiaAcceptancePackWithOptions(
|
||||||
|
context.Background(), "", body.DiagLevel, body.GPUIndices,
|
||||||
|
)
|
||||||
|
if e != nil {
|
||||||
|
err = e
|
||||||
|
} else {
|
||||||
|
archive = result.Body
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
archive, err = h.opts.App.RunNvidiaAcceptancePack("")
|
||||||
|
}
|
||||||
|
case "memory":
|
||||||
|
archive, err = h.opts.App.RunMemoryAcceptancePack("")
|
||||||
|
case "storage":
|
||||||
|
archive, err = h.opts.App.RunStorageAcceptancePack("")
|
||||||
|
case "cpu":
|
||||||
|
dur := body.Duration
|
||||||
|
if dur <= 0 {
|
||||||
|
dur = 60
|
||||||
|
}
|
||||||
|
archive, err = h.opts.App.RunCPUAcceptancePack("", dur)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
j.append("ERROR: " + err.Error())
|
||||||
|
j.finish(err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
j.append(fmt.Sprintf("Archive written: %s", archive))
|
||||||
|
j.finish("")
|
||||||
|
}()
|
||||||
|
|
||||||
|
writeJSON(w, map[string]string{"job_id": id})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||||
|
id := r.URL.Query().Get("job_id")
|
||||||
|
j, ok := globalJobs.get(id)
|
||||||
|
if !ok {
|
||||||
|
http.Error(w, "job not found", http.StatusNotFound)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
streamJob(w, r, j)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Services ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAPIServicesList(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
names, err := h.opts.App.ListBeeServices()
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
type serviceInfo struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
State string `json:"state"`
|
||||||
|
Body string `json:"body"`
|
||||||
|
}
|
||||||
|
result := make([]serviceInfo, 0, len(names))
|
||||||
|
for _, name := range names {
|
||||||
|
state := h.opts.App.ServiceState(name)
|
||||||
|
body, _ := h.opts.App.ServiceStatus(name)
|
||||||
|
result = append(result, serviceInfo{Name: name, State: state, Body: body})
|
||||||
|
}
|
||||||
|
writeJSON(w, result)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Action string `json:"action"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var action platform.ServiceAction
|
||||||
|
switch req.Action {
|
||||||
|
case "start":
|
||||||
|
action = platform.ServiceStart
|
||||||
|
case "stop":
|
||||||
|
action = platform.ServiceStop
|
||||||
|
case "restart":
|
||||||
|
action = platform.ServiceRestart
|
||||||
|
default:
|
||||||
|
writeError(w, http.StatusBadRequest, "action must be start|stop|restart")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Network ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAPINetworkStatus(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ifaces, err := h.opts.App.ListInterfaces()
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]any{
|
||||||
|
"interfaces": ifaces,
|
||||||
|
"default_route": h.opts.App.DefaultRoute(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPINetworkDHCP(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req struct {
|
||||||
|
Interface string `json:"interface"`
|
||||||
|
}
|
||||||
|
_ = json.NewDecoder(r.Body).Decode(&req)
|
||||||
|
|
||||||
|
var result app.ActionResult
|
||||||
|
var err error
|
||||||
|
if req.Interface == "" || req.Interface == "all" {
|
||||||
|
result, err = h.opts.App.DHCPAllResult()
|
||||||
|
} else {
|
||||||
|
result, err = h.opts.App.DHCPOneResult(req.Interface)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPINetworkStatic(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req struct {
|
||||||
|
Interface string `json:"interface"`
|
||||||
|
Address string `json:"address"`
|
||||||
|
Prefix string `json:"prefix"`
|
||||||
|
Gateway string `json:"gateway"`
|
||||||
|
DNS []string `json:"dns"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cfg := platform.StaticIPv4Config{
|
||||||
|
Interface: req.Interface,
|
||||||
|
Address: req.Address,
|
||||||
|
Prefix: req.Prefix,
|
||||||
|
Gateway: req.Gateway,
|
||||||
|
DNS: req.DNS,
|
||||||
|
}
|
||||||
|
result, err := h.opts.App.SetStaticIPv4Result(cfg)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Export ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAPIExportList(w http.ResponseWriter, r *http.Request) {
|
||||||
|
entries, err := listExportFiles(h.opts.ExportDir)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, entries)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request) {
|
||||||
|
archive, err := app.BuildSupportBundle(h.opts.ExportDir)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{
|
||||||
|
"status": "ok",
|
||||||
|
"path": archive,
|
||||||
|
"url": "/export/support.tar.gz",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// standardTools lists the external diagnostic binaries the Tools page checks
// for; the order here is the order reported by handleAPIToolsCheck.
var standardTools = []string{
	"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
	"nvidia-smi", "memtester", "stress-ng", "nvtop",
	"mstflint", "qrencode",
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
statuses := h.opts.App.CheckTools(standardTools)
|
||||||
|
writeJSON(w, statuses)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Preflight ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAPIPreflight(w http.ResponseWriter, r *http.Request) {
|
||||||
|
data, err := loadSnapshot(filepath.Join(h.opts.ExportDir, "runtime-health.json"))
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusNotFound, "runtime health not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Metrics SSE ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// handleAPIMetricsStream streams live hardware metrics over Server-Sent
// Events, emitting one "metrics" event per second until the client
// disconnects. As a side effect it feeds the handler's ring buffers that
// back the server-rendered SVG charts.
func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) {
	// sseStart writes the SSE response headers; bail if streaming is not possible.
	if !sseStart(w) {
		return
	}
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-r.Context().Done():
			// Client went away; stop sampling.
			return
		case <-ticker.C:
			sample := platform.SampleLiveMetrics()

			// Feed ring buffers for server-side SVG charts
			for _, t := range sample.Temps {
				if t.Name == "CPU" {
					h.ringCPUTemp.push(t.Celsius)
					break
				}
			}
			h.ringPower.push(sample.PowerW)

			b, err := json.Marshal(sample)
			if err != nil {
				// Skip this tick rather than terminate the stream on a marshal error.
				continue
			}
			if !sseWrite(w, "metrics", string(b)) {
				// Write failed: connection is gone.
				return
			}
		}
	}
}
|
||||||
84
audit/internal/webui/jobs.go
Normal file
84
audit/internal/webui/jobs.go
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// jobState holds the output lines and completion status of an async job.
// All fields are guarded by mu.
type jobState struct {
	lines []string // all output lines captured so far
	done bool // set once finish() has been called
	err string // non-empty error message when the job failed
	mu sync.Mutex
	// subs is a list of channels that receive new lines as they arrive.
	subs []chan string
}
|
||||||
|
|
||||||
|
func (j *jobState) append(line string) {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
j.lines = append(j.lines, line)
|
||||||
|
for _, ch := range j.subs {
|
||||||
|
select {
|
||||||
|
case ch <- line:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) finish(errMsg string) {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
j.done = true
|
||||||
|
j.err = errMsg
|
||||||
|
for _, ch := range j.subs {
|
||||||
|
close(ch)
|
||||||
|
}
|
||||||
|
j.subs = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// subscribe returns a channel that receives all future lines.
|
||||||
|
// Existing lines are returned first, then the channel streams new ones.
|
||||||
|
func (j *jobState) subscribe() ([]string, <-chan string) {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
existing := make([]string, len(j.lines))
|
||||||
|
copy(existing, j.lines)
|
||||||
|
if j.done {
|
||||||
|
return existing, nil
|
||||||
|
}
|
||||||
|
ch := make(chan string, 256)
|
||||||
|
j.subs = append(j.subs, ch)
|
||||||
|
return existing, ch
|
||||||
|
}
|
||||||
|
|
||||||
|
// jobManager manages async jobs identified by string IDs.
type jobManager struct {
	mu sync.Mutex
	jobs map[string]*jobState
}

// globalJobs is the process-wide job registry used by the web handlers.
var globalJobs = &jobManager{jobs: make(map[string]*jobState)}
|
||||||
|
|
||||||
|
func (m *jobManager) create(id string) *jobState {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
j := &jobState{}
|
||||||
|
m.jobs[id] = j
|
||||||
|
// Schedule cleanup after 30 minutes
|
||||||
|
go func() {
|
||||||
|
time.Sleep(30 * time.Minute)
|
||||||
|
m.mu.Lock()
|
||||||
|
delete(m.jobs, id)
|
||||||
|
m.mu.Unlock()
|
||||||
|
}()
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *jobManager) get(id string) (*jobState, bool) {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
j, ok := m.jobs[id]
|
||||||
|
return j, ok
|
||||||
|
}
|
||||||
660
audit/internal/webui/pages.go
Normal file
660
audit/internal/webui/pages.go
Normal file
@@ -0,0 +1,660 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"net/url"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ── Layout ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// layoutHead returns the shared HTML document prologue (doctype, <head> with
// the full inline stylesheet, and the opening <body> tag). The page title is
// HTML-escaped before interpolation.
func layoutHead(title string) string {
	return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>` + html.EscapeString(title) + `</title>
<style>
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:system-ui,-apple-system,sans-serif;background:#0f1117;color:#e2e8f0;display:flex;min-height:100vh}
a{color:inherit;text-decoration:none}
/* Sidebar */
.sidebar{width:200px;min-height:100vh;background:#161b25;border-right:1px solid #252d3d;flex-shrink:0;display:flex;flex-direction:column}
.sidebar-logo{padding:20px 16px 12px;font-size:20px;font-weight:700;color:#60a5fa;letter-spacing:-0.5px}
.sidebar-logo span{color:#94a3b8;font-weight:400;font-size:13px;display:block;margin-top:2px}
.nav{flex:1}
.nav-item{display:block;padding:10px 16px;color:#94a3b8;font-size:14px;border-left:3px solid transparent;transition:all .15s}
.nav-item:hover,.nav-item.active{background:#1e2535;color:#e2e8f0;border-left-color:#3b82f6}
.nav-icon{margin-right:8px;opacity:.7}
/* Content */
.main{flex:1;display:flex;flex-direction:column;overflow:auto}
.topbar{padding:16px 24px;border-bottom:1px solid #1e2535;display:flex;align-items:center;gap:12px}
.topbar h1{font-size:18px;font-weight:600}
.content{padding:24px;flex:1}
/* Cards */
.card{background:#161b25;border:1px solid #1e2535;border-radius:10px;margin-bottom:16px}
.card-head{padding:14px 18px;border-bottom:1px solid #1e2535;font-weight:600;font-size:14px;display:flex;align-items:center;gap:8px}
.card-body{padding:18px}
/* Buttons */
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:6px;font-size:13px;font-weight:600;cursor:pointer;border:none;transition:background .15s}
.btn-primary{background:#3b82f6;color:#fff}.btn-primary:hover{background:#2563eb}
.btn-danger{background:#ef4444;color:#fff}.btn-danger:hover{background:#dc2626}
.btn-secondary{background:#1e2535;color:#94a3b8;border:1px solid #252d3d}.btn-secondary:hover{background:#252d3d;color:#e2e8f0}
.btn-sm{padding:5px 10px;font-size:12px}
/* Tables */
table{width:100%;border-collapse:collapse;font-size:13px}
th{text-align:left;padding:8px 12px;color:#64748b;font-weight:600;border-bottom:1px solid #1e2535}
td{padding:8px 12px;border-bottom:1px solid #1a2030}
tr:last-child td{border:none}
tr:hover td{background:#1a2030}
/* Status badges */
.badge{display:inline-block;padding:2px 8px;border-radius:999px;font-size:11px;font-weight:600}
.badge-ok{background:#166534;color:#86efac}
.badge-warn{background:#713f12;color:#fde68a}
.badge-err{background:#7f1d1d;color:#fca5a5}
.badge-unknown{background:#1e293b;color:#64748b}
/* Output terminal */
.terminal{background:#0a0d14;border:1px solid #1e2535;border-radius:8px;padding:14px;font-family:monospace;font-size:12px;color:#86efac;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all}
/* Forms */
.form-row{margin-bottom:14px}
.form-row label{display:block;font-size:12px;color:#64748b;margin-bottom:5px}
.form-row input,.form-row select{width:100%;padding:8px 10px;background:#0f1117;border:1px solid #252d3d;border-radius:6px;color:#e2e8f0;font-size:13px;outline:none}
.form-row input:focus,.form-row select:focus{border-color:#3b82f6}
.chart-legend{font-size:11px;color:#64748b;padding:4px 0}
/* Grid */
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}}
/* iframe viewer */
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:8px;background:#1a1f2e}
/* Alerts */
.alert{padding:10px 14px;border-radius:8px;font-size:13px;margin-bottom:14px}
.alert-info{background:#1e3a5f;border:1px solid #2563eb;color:#93c5fd}
.alert-warn{background:#451a03;border:1px solid #d97706;color:#fde68a}
</style>
</head>
<body>
`
}
|
||||||
|
|
||||||
|
func layoutNav(active string) string {
|
||||||
|
items := []struct{ id, icon, label string }{
|
||||||
|
{"dashboard", "", "Dashboard"},
|
||||||
|
{"metrics", "", "Metrics"},
|
||||||
|
{"tests", "", "Acceptance Tests"},
|
||||||
|
{"burn-in", "", "Burn-in"},
|
||||||
|
{"network", "", "Network"},
|
||||||
|
{"services", "", "Services"},
|
||||||
|
{"export", "", "Export"},
|
||||||
|
{"tools", "", "Tools"},
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<aside class="sidebar">`)
|
||||||
|
b.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
|
||||||
|
b.WriteString(`<nav class="nav">`)
|
||||||
|
for _, item := range items {
|
||||||
|
cls := "nav-item"
|
||||||
|
if item.id == active {
|
||||||
|
cls += " active"
|
||||||
|
}
|
||||||
|
href := "/"
|
||||||
|
if item.id != "dashboard" {
|
||||||
|
href = "/" + item.id
|
||||||
|
}
|
||||||
|
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
||||||
|
cls, href, item.label))
|
||||||
|
}
|
||||||
|
b.WriteString(`</nav></aside>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderPage dispatches to the appropriate page renderer.
|
||||||
|
func renderPage(page string, opts HandlerOptions) string {
|
||||||
|
var pageID, title, body string
|
||||||
|
switch page {
|
||||||
|
case "dashboard", "":
|
||||||
|
pageID = "dashboard"
|
||||||
|
title = "Dashboard"
|
||||||
|
body = renderDashboard(opts)
|
||||||
|
case "metrics":
|
||||||
|
pageID = "metrics"
|
||||||
|
title = "Live Metrics"
|
||||||
|
body = renderMetrics()
|
||||||
|
case "tests":
|
||||||
|
pageID = "tests"
|
||||||
|
title = "Acceptance Tests"
|
||||||
|
body = renderTests()
|
||||||
|
case "burn-in":
|
||||||
|
pageID = "burn-in"
|
||||||
|
title = "Burn-in Tests"
|
||||||
|
body = renderBurnIn()
|
||||||
|
case "network":
|
||||||
|
pageID = "network"
|
||||||
|
title = "Network"
|
||||||
|
body = renderNetwork()
|
||||||
|
case "services":
|
||||||
|
pageID = "services"
|
||||||
|
title = "Services"
|
||||||
|
body = renderServices()
|
||||||
|
case "export":
|
||||||
|
pageID = "export"
|
||||||
|
title = "Export"
|
||||||
|
body = renderExport(opts.ExportDir)
|
||||||
|
case "tools":
|
||||||
|
pageID = "tools"
|
||||||
|
title = "Tools"
|
||||||
|
body = renderTools()
|
||||||
|
default:
|
||||||
|
pageID = "dashboard"
|
||||||
|
title = "Not Found"
|
||||||
|
body = `<div class="alert alert-warn">Page not found.</div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
return layoutHead(opts.Title+" — "+title) +
|
||||||
|
layoutNav(pageID) +
|
||||||
|
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
||||||
|
body +
|
||||||
|
`</div></div></body></html>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Dashboard ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func renderDashboard(opts HandlerOptions) string {
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="grid2">`)
|
||||||
|
// Left: health summary
|
||||||
|
b.WriteString(`<div>`)
|
||||||
|
b.WriteString(renderHealthCard(opts))
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
// Right: quick actions
|
||||||
|
b.WriteString(`<div>`)
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Quick Actions</div><div class="card-body">`)
|
||||||
|
b.WriteString(`<a class="btn btn-primary" href="/export/support.tar.gz" style="display:block;margin-bottom:10px">⬇ Download Support Bundle</a>`)
|
||||||
|
b.WriteString(`<a class="btn btn-secondary" href="/audit.json" style="display:block;margin-bottom:10px" target="_blank">📄 Open audit.json</a>`)
|
||||||
|
b.WriteString(`<a class="btn btn-secondary" href="/export/" style="display:block">📁 Browse Export Files</a>`)
|
||||||
|
b.WriteString(`<div style="margin-top:14px"><button class="btn btn-secondary" onclick="runAudit()">▶ Re-run Audit</button></div>`)
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
// Audit viewer iframe
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Audit Snapshot</div><div class="card-body" style="padding:0">`)
|
||||||
|
b.WriteString(`<iframe class="viewer-frame" src="/viewer" loading="eager" referrerpolicy="same-origin"></iframe>`)
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
|
||||||
|
// Audit run output div
|
||||||
|
b.WriteString(`<div id="audit-output" style="display:none" class="card"><div class="card-head">Audit Output</div><div class="card-body"><div id="audit-terminal" class="terminal"></div></div></div>`)
|
||||||
|
|
||||||
|
b.WriteString(`<script>
|
||||||
|
function runAudit() {
|
||||||
|
document.getElementById('audit-output').style.display='block';
|
||||||
|
const term = document.getElementById('audit-terminal');
|
||||||
|
term.textContent = 'Starting audit...\n';
|
||||||
|
fetch('/api/audit/run', {method:'POST'})
|
||||||
|
.then(r => r.json())
|
||||||
|
.then(d => {
|
||||||
|
const es = new EventSource('/api/audit/stream?job_id=' + d.job_id);
|
||||||
|
es.onmessage = e => { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
es.addEventListener('done', e => { es.close(); term.textContent += (e.data ? '\\nERROR: ' + e.data : '\\nDone.') + '\n'; location.reload(); });
|
||||||
|
});
|
||||||
|
}
|
||||||
|
</script>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderHealthCard(opts HandlerOptions) string {
|
||||||
|
data, err := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json"))
|
||||||
|
if err != nil {
|
||||||
|
return `<div class="card"><div class="card-head">Runtime Health</div><div class="card-body"><span class="badge badge-unknown">No data</span></div></div>`
|
||||||
|
}
|
||||||
|
var health map[string]any
|
||||||
|
if err := json.Unmarshal(data, &health); err != nil {
|
||||||
|
return `<div class="card"><div class="card-head">Runtime Health</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||||
|
}
|
||||||
|
status := fmt.Sprintf("%v", health["status"])
|
||||||
|
badge := "badge-ok"
|
||||||
|
if status == "PARTIAL" {
|
||||||
|
badge = "badge-warn"
|
||||||
|
} else if status == "FAIL" || status == "FAILED" {
|
||||||
|
badge = "badge-err"
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Runtime Health</div><div class="card-body">`)
|
||||||
|
b.WriteString(fmt.Sprintf(`<div style="margin-bottom:10px"><span class="badge %s">%s</span></div>`, badge, html.EscapeString(status)))
|
||||||
|
if issues, ok := health["issues"].([]any); ok && len(issues) > 0 {
|
||||||
|
b.WriteString(`<div style="font-size:12px;color:#f87171">Issues:<br>`)
|
||||||
|
for _, issue := range issues {
|
||||||
|
if m, ok := issue.(map[string]any); ok {
|
||||||
|
b.WriteString(html.EscapeString(fmt.Sprintf("%v: %v", m["code"], m["message"])) + "<br>")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Metrics ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// renderMetrics returns the Live Metrics page: server-rendered SVG charts
// refreshed every 2 seconds plus tables fed from the /api/metrics/stream SSE
// endpoint (GPU telemetry, temperatures, fans, system power).
func renderMetrics() string {
	return `<p style="color:#64748b;font-size:13px;margin-bottom:16px">Live server metrics, charts updated every 2 seconds.</p>
<div class="grid2">
<div class="card">
<div class="card-head">System</div>
<div class="card-body">
<img id="chart-cpu-temp" src="/api/metrics/chart/cpu-temp.svg" style="width:100%;border-radius:6px" alt="CPU Temp">
<img id="chart-power" src="/api/metrics/chart/power.svg" style="width:100%;border-radius:6px;margin-top:8px" alt="Power">
<div id="sys-table" style="margin-top:8px"></div>
</div>
</div>
<div class="card">
<div class="card-head">GPU</div>
<div class="card-body">
<div id="gpu-table"><p style="color:#64748b;font-size:12px">Waiting for data...</p></div>
</div>
</div>
</div>
<script>
function refreshCharts() {
const t = '?t=' + Date.now();
['chart-cpu-temp','chart-power'].forEach(id => {
const el = document.getElementById(id);
if (el) el.src = el.src.split('?')[0] + t;
});
}
setInterval(refreshCharts, 2000);

const es = new EventSource('/api/metrics/stream');
es.addEventListener('metrics', e => {
const d = JSON.parse(e.data);
const gpuRows = (d.gpus||[]).map(g =>
'<tr><td>GPU '+g.index+'</td><td>'+g.temp_c+'°C</td><td>'+g.usage_pct+'%</td><td>'+g.power_w+'W</td><td>'+g.clock_mhz+'MHz</td></tr>'
).join('');
document.getElementById('gpu-table').innerHTML = gpuRows ?
'<table><tr><th>GPU</th><th>Temp</th><th>Usage</th><th>Power</th><th>Clock</th></tr>'+gpuRows+'</table>' :
'<p style="color:#64748b;font-size:12px">No NVIDIA GPU detected</p>';

let sysHTML = '';
const cpuTemp = (d.temps||[]).find(t => t.name==='CPU');
if (cpuTemp) sysHTML += '<tr><td>CPU Temp</td><td>'+cpuTemp.celsius.toFixed(1)+'°C</td></tr>';
(d.fans||[]).forEach(f => sysHTML += '<tr><td>'+f.name+'</td><td>'+f.rpm+' RPM</td></tr>');
if (d.power_w) sysHTML += '<tr><td>System Power</td><td>'+d.power_w.toFixed(0)+'W</td></tr>';
document.getElementById('sys-table').innerHTML = sysHTML ?
'<table>'+sysHTML+'</table>' :
'<p style="color:#64748b;font-size:12px">No sensor data (ipmitool/sensors required)</p>';
});
es.onerror = () => {};
</script>`
}
|
||||||
|
|
||||||
|
// ── Acceptance Tests ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// renderTests returns the Acceptance Tests page: one card per test target
// (NVIDIA, memory, storage, CPU) plus a shared SSE-driven output terminal.
// Target-specific form fields (diag level, duration) are passed to
// renderSATCard as extra markup.
func renderTests() string {
	return `<p style="color:#64748b;font-size:13px;margin-bottom:16px">Run hardware acceptance tests and view results.</p>
<div class="grid2">
` + renderSATCard("nvidia", "NVIDIA GPU", `<div class="form-row"><label>Diag Level</label><select id="sat-nvidia-level"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>`) +
		renderSATCard("memory", "Memory", "") +
		renderSATCard("storage", "Storage", "") +
		renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
		`</div>
<div id="sat-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Test Output <span id="sat-title"></span></div>
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
</div>
<script>
let satES = null;
function runSAT(target) {
if (satES) satES.close();
const body = {};
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target;
const term = document.getElementById('sat-terminal');
term.textContent = 'Starting ' + target + ' test...\n';
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
.then(r => r.json())
.then(d => {
satES = new EventSource('/api/sat/stream?job_id='+d.job_id);
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
});
}
</script>`
}
|
||||||
|
|
||||||
|
// renderSATCard renders one acceptance-test card: a heading, an optional
// extra form fragment, and a button wired to the JS runSAT(id) handler.
func renderSATCard(id, label, extra string) string {
	const cardTmpl = `<div class="card"><div class="card-head">%s</div><div class="card-body">%s<button class="btn btn-primary" onclick="runSAT('%s')">▶ Run Test</button></div></div>`
	return fmt.Sprintf(cardTmpl, label, extra, id)
}
|
||||||
|
|
||||||
|
// ── Burn-in ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// renderBurnIn returns the Burn-in page: long-running GPU and CPU stress
// cards that reuse the /api/sat/<target>/run endpoints with a duration body,
// streaming output over the same SSE channel as the acceptance tests.
func renderBurnIn() string {
	return `<p style="color:#64748b;font-size:13px;margin-bottom:16px">Long-running GPU and system stress tests. Check <a href="/metrics" style="color:#60a5fa">Metrics</a> page for live telemetry.</p>
<div class="grid2">
<div class="card"><div class="card-head">GPU Platform Stress</div><div class="card-body">
<div class="form-row"><label>Duration</label><select id="bi-dur"><option value="600">10 minutes</option><option value="3600">1 hour</option><option value="28800">8 hours</option><option value="86400">24 hours</option></select></div>
<button class="btn btn-primary" onclick="runBurnIn('nvidia')">▶ Start GPU Stress</button>
</div></div>
<div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
<div class="form-row"><label>Duration (seconds)</label><input type="number" id="bi-cpu-dur" value="300" min="60"></div>
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
</div></div>
</div>
<div id="bi-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Output</div>
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
</div>
<script>
let biES = null;
function runBurnIn(target) {
if (biES) biES.close();
const body = {};
if (target === 'nvidia') body.duration = parseInt(document.getElementById('bi-dur').value)||600;
if (target === 'cpu') body.duration = parseInt(document.getElementById('bi-cpu-dur').value)||300;
document.getElementById('bi-output').style.display='block';
const term = document.getElementById('bi-terminal');
term.textContent = 'Starting ' + target + ' burn-in...\n';
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
.then(r => r.json())
.then(d => {
biES = new EventSource('/api/sat/stream?job_id='+d.job_id);
biES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
biES.addEventListener('done', e => { biES.close(); term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
});
}
</script>`
}
|
||||||
|
|
||||||
|
// ── Network ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// renderNetwork returns the Network page: an interface table refreshed via
// /api/network, a DHCP trigger form (/api/network/dhcp), and a static IPv4
// configuration form (/api/network/static).
func renderNetwork() string {
	return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">
<div id="iface-table"><p style="color:#64748b;font-size:13px">Loading...</p></div>
</div></div>
<div class="grid2">
<div class="card"><div class="card-head">DHCP</div><div class="card-body">
<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
<button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:#86efac"></div>
</div></div>
<div class="card"><div class="card-head">Static IPv4</div><div class="card-body">
<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
<div id="static-out" style="margin-top:10px;font-size:12px;color:#86efac"></div>
</div></div>
</div>
<script>
function loadNetwork() {
fetch('/api/network').then(r=>r.json()).then(d => {
const rows = (d.interfaces||[]).map(i =>
'<tr><td>'+i.Name+'</td><td><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td><td>'+(i.IPv4||[]).join(', ')+'</td></tr>'
).join('');
document.getElementById('iface-table').innerHTML =
'<table><tr><th>Interface</th><th>State</th><th>Addresses</th></tr>'+rows+'</table>' +
(d.default_route ? '<p style="font-size:12px;color:#64748b;margin-top:8px">Default route: '+d.default_route+'</p>' : '');
});
}
function runDHCP() {
const iface = document.getElementById('dhcp-iface').value.trim();
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
.then(r=>r.json()).then(d => {
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
loadNetwork();
});
}
function setStatic() {
const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
interface: document.getElementById('st-iface').value,
address: document.getElementById('st-addr').value,
prefix: document.getElementById('st-prefix').value,
gateway: document.getElementById('st-gw').value,
dns: dns,
})}).then(r=>r.json()).then(d => {
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
loadNetwork();
});
}
loadNetwork();
</script>`
}
|
||||||
|
|
||||||
|
// ── Services ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func renderServices() string {
|
||||||
|
return `<div class="card"><div class="card-head">Bee Services <button class="btn btn-sm btn-secondary" onclick="loadServices()" style="margin-left:auto">↻ Refresh</button></div>
|
||||||
|
<div class="card-body">
|
||||||
|
<div id="svc-table"><p style="color:#64748b;font-size:13px">Loading...</p></div>
|
||||||
|
</div></div>
|
||||||
|
<div id="svc-out" style="display:none;margin-top:8px" class="card">
|
||||||
|
<div class="card-head">Output</div>
|
||||||
|
<div class="card-body" style="padding:10px"><div id="svc-terminal" class="terminal" style="max-height:150px"></div></div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
function loadServices() {
|
||||||
|
fetch('/api/services').then(r=>r.json()).then(svcs => {
|
||||||
|
const rows = svcs.map(s => {
|
||||||
|
const st = s.state||'unknown';
|
||||||
|
const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
|
||||||
|
const id = 'svc-body-'+s.name.replace(/[^a-z0-9]/g,'-');
|
||||||
|
const body = (s.body||'').replace(/</g,'<').replace(/>/g,'>');
|
||||||
|
return '<tr>' +
|
||||||
|
'<td style="white-space:nowrap">'+s.name+'</td>' +
|
||||||
|
'<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+st+' ▾</span>' +
|
||||||
|
'<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#0a0d14;padding:8px;border-radius:6px;color:#94a3b8">'+body+'</pre></div>' +
|
||||||
|
'</td>' +
|
||||||
|
'<td style="white-space:nowrap">' +
|
||||||
|
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'start\')">Start</button> ' +
|
||||||
|
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'stop\')">Stop</button> ' +
|
||||||
|
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'restart\')">Restart</button>' +
|
||||||
|
'</td></tr>';
|
||||||
|
}).join('');
|
||||||
|
document.getElementById('svc-table').innerHTML =
|
||||||
|
'<table><tr><th>Service</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function toggleBody(id) {
|
||||||
|
const el = document.getElementById(id);
|
||||||
|
if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
|
||||||
|
}
|
||||||
|
function svcAction(name, action) {
|
||||||
|
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
|
||||||
|
.then(r=>r.json()).then(d => {
|
||||||
|
document.getElementById('svc-out').style.display='block';
|
||||||
|
document.getElementById('svc-terminal').textContent = d.output || d.error || action+' '+name;
|
||||||
|
setTimeout(loadServices, 1000);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
loadServices();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Export ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func renderExport(exportDir string) string {
|
||||||
|
entries, _ := listExportFiles(exportDir)
|
||||||
|
var rows strings.Builder
|
||||||
|
for _, e := range entries {
|
||||||
|
rows.WriteString(fmt.Sprintf(`<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
|
||||||
|
url.QueryEscape(e), html.EscapeString(e)))
|
||||||
|
}
|
||||||
|
if len(entries) == 0 {
|
||||||
|
rows.WriteString(`<tr><td style="color:#64748b">No export files found.</td></tr>`)
|
||||||
|
}
|
||||||
|
return `<div class="grid2">
|
||||||
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
|
<p style="font-size:13px;color:#94a3b8;margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
|
<a class="btn btn-primary" href="/export/support.tar.gz">⬇ Download Support Bundle</a>
|
||||||
|
</div></div>
|
||||||
|
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||||
|
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||||
|
</div></div>
|
||||||
|
</div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func listExportFiles(exportDir string) ([]string, error) {
|
||||||
|
var entries []string
|
||||||
|
err := filepath.Walk(strings.TrimSpace(exportDir), func(path string, info os.FileInfo, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(exportDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
entries = append(entries, rel)
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
if err != nil && !os.IsNotExist(err) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
sort.Strings(entries)
|
||||||
|
return entries, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// renderTools renders the Tools page: a card with a manual "Check"
// button and inline JavaScript that fetches GET /api/tools/check and
// renders each returned tool (fields Name, OK, Path) as a table row.
// The check also runs once automatically when the page loads.
func renderTools() string {
	return `<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
<div class="card-body"><div id="tools-table"><p style="color:#64748b;font-size:13px">Click Check to verify installed tools.</p></div></div></div>
<script>
function checkTools() {
document.getElementById('tools-table').innerHTML = '<p style="color:#64748b;font-size:13px">Checking...</p>';
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
const rows = tools.map(t =>
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
).join('');
document.getElementById('tools-table').innerHTML =
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
});
}
checkTools();
</script>`
}
|
||||||
|
|
||||||
|
// ── Viewer (compatibility) ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// renderViewerPage renders the audit snapshot as a styled HTML page.
// This endpoint is embedded as an iframe on the Dashboard page.
//
// Degradation path: an empty snapshot produces a "no snapshot yet"
// placeholder; a snapshot that is not valid JSON is shown escaped
// inside a <pre> block instead of failing the page.
func renderViewerPage(title string, snapshot []byte) string {
	var b strings.Builder
	b.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8">`)
	b.WriteString(`<title>` + html.EscapeString(title) + `</title>`)
	b.WriteString(`<style>
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:system-ui,sans-serif;background:#0f1117;color:#e2e8f0;padding:20px}
h2{font-size:14px;color:#64748b;margin-bottom:8px;margin-top:16px;text-transform:uppercase;letter-spacing:.05em}
.grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:12px}
.card{background:#161b25;border:1px solid #1e2535;border-radius:8px;padding:14px}
.card-title{font-size:12px;color:#64748b;margin-bottom:6px}
.card-value{font-size:15px;font-weight:600}
.badge{display:inline-block;padding:2px 8px;border-radius:999px;font-size:11px;font-weight:600}
.ok{background:#166534;color:#86efac}.warn{background:#713f12;color:#fde68a}.err{background:#7f1d1d;color:#fca5a5}
pre{background:#0a0d14;border:1px solid #1e2535;border-radius:6px;padding:12px;font-size:11px;overflow-x:auto;color:#94a3b8;white-space:pre-wrap;word-break:break-word;max-height:400px;overflow-y:auto}
</style></head><body>
`)
	if len(snapshot) == 0 {
		// No audit has run yet: show a hint rather than a blank iframe.
		b.WriteString(`<p style="color:#64748b">No audit snapshot available yet. Re-run audit from the Dashboard.</p>`)
		b.WriteString(`</body></html>`)
		return b.String()
	}

	var data map[string]any
	if err := json.Unmarshal(snapshot, &data); err != nil {
		// Fallback: render raw JSON
		b.WriteString(`<pre>` + html.EscapeString(string(snapshot)) + `</pre>`)
		b.WriteString(`</body></html>`)
		return b.String()
	}

	// Collected at
	if t, ok := data["collected_at"].(string); ok {
		b.WriteString(`<p style="font-size:12px;color:#64748b;margin-bottom:16px">Collected: ` + html.EscapeString(t) + `</p>`)
	}

	// Hardware section. Falls back to the whole document when "hardware"
	// is absent or not an object — presumably to support older snapshot
	// layouts that kept hardware keys at the top level (TODO confirm).
	hw, _ := data["hardware"].(map[string]any)
	if hw == nil {
		hw = data
	}

	renderHWCards(&b, hw)

	// Full JSON below
	b.WriteString(`<h2>Raw JSON</h2>`)
	pretty, _ := json.MarshalIndent(data, "", "  ")
	b.WriteString(`<pre>` + html.EscapeString(string(pretty)) + `</pre>`)
	b.WriteString(`</body></html>`)
	return b.String()
}
|
||||||
|
|
||||||
|
func renderHWCards(b *strings.Builder, hw map[string]any) {
|
||||||
|
sections := []struct{ key, label string }{
|
||||||
|
{"board", "Board"},
|
||||||
|
{"cpus", "CPUs"},
|
||||||
|
{"memory", "Memory"},
|
||||||
|
{"storage", "Storage"},
|
||||||
|
{"gpus", "GPUs"},
|
||||||
|
{"nics", "NICs"},
|
||||||
|
{"psus", "Power Supplies"},
|
||||||
|
}
|
||||||
|
for _, s := range sections {
|
||||||
|
v, ok := hw[s.key]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
b.WriteString(`<h2>` + s.label + `</h2><div class="grid">`)
|
||||||
|
renderValue(b, v)
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderValue(b *strings.Builder, v any) {
|
||||||
|
switch val := v.(type) {
|
||||||
|
case []any:
|
||||||
|
for _, item := range val {
|
||||||
|
renderValue(b, item)
|
||||||
|
}
|
||||||
|
case map[string]any:
|
||||||
|
b.WriteString(`<div class="card">`)
|
||||||
|
for k, vv := range val {
|
||||||
|
b.WriteString(fmt.Sprintf(`<div class="card-title">%s</div><div class="card-value">%s</div>`,
|
||||||
|
html.EscapeString(k), html.EscapeString(fmt.Sprintf("%v", vv))))
|
||||||
|
}
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Export index (compatibility) ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
func renderExportIndex(exportDir string) (string, error) {
|
||||||
|
entries, err := listExportFiles(exportDir)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
body.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
|
||||||
|
body.WriteString(`<h1>Bee Export Files</h1><ul>`)
|
||||||
|
for _, entry := range entries {
|
||||||
|
body.WriteString(`<li><a href="/export/file?path=` + url.QueryEscape(entry) + `">` + html.EscapeString(entry) + `</a></li>`)
|
||||||
|
}
|
||||||
|
if len(entries) == 0 {
|
||||||
|
body.WriteString(`<li>No export files found.</li>`)
|
||||||
|
}
|
||||||
|
body.WriteString(`</ul></body></html>`)
|
||||||
|
return body.String(), nil
|
||||||
|
}
|
||||||
339
audit/internal/webui/server.go
Normal file
339
audit/internal/webui/server.go
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/runtimeenv"
|
||||||
|
gocharts "github.com/go-analyze/charts"
|
||||||
|
"reanimator/chart/viewer"
|
||||||
|
"reanimator/chart/web"
|
||||||
|
)
|
||||||
|
|
||||||
|
const defaultTitle = "Bee Hardware Audit"
|
||||||
|
|
||||||
|
// HandlerOptions configures the web UI handler.
|
||||||
|
type HandlerOptions struct {
|
||||||
|
Title string
|
||||||
|
AuditPath string
|
||||||
|
ExportDir string
|
||||||
|
App *app.App
|
||||||
|
RuntimeMode runtimeenv.Mode
|
||||||
|
}
|
||||||
|
|
||||||
|
// metricsRing holds a rolling window of live metric samples.
// Once len(vals) reaches size, push drops the oldest sample.
type metricsRing struct {
	mu     sync.Mutex // guards vals and labels
	vals   []float64  // sample values, oldest first
	labels []string   // per-sample "15:04" timestamps, parallel to vals
	size   int        // maximum number of retained samples
}
|
||||||
|
|
||||||
|
func newMetricsRing(size int) *metricsRing {
|
||||||
|
return &metricsRing{size: size, vals: make([]float64, 0, size), labels: make([]string, 0, size)}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *metricsRing) push(v float64) {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
if len(r.vals) >= r.size {
|
||||||
|
r.vals = r.vals[1:]
|
||||||
|
r.labels = r.labels[1:]
|
||||||
|
}
|
||||||
|
r.vals = append(r.vals, v)
|
||||||
|
r.labels = append(r.labels, time.Now().Format("15:04"))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *metricsRing) snapshot() ([]float64, []string) {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
v := make([]float64, len(r.vals))
|
||||||
|
l := make([]string, len(r.labels))
|
||||||
|
copy(v, r.vals)
|
||||||
|
copy(l, r.labels)
|
||||||
|
return v, l
|
||||||
|
}
|
||||||
|
|
||||||
|
// handler is the HTTP handler for the web UI.
type handler struct {
	opts HandlerOptions // effective options; defaults applied in NewHandler
	mux  *http.ServeMux // route table built by NewHandler

	// Rolling sample windows feeding the live metrics charts.
	// ringCPUTemp/ringPower are created in NewHandler; the per-device
	// slices below are not — presumably populated by the metrics
	// stream handler elsewhere in the package (TODO confirm).
	ringCPUTemp *metricsRing
	ringPower   *metricsRing
	ringFans    []*metricsRing
	ringGPUTemp []*metricsRing
	ringGPUUtil []*metricsRing
	ringsMu     sync.Mutex // NOTE(review): appears to guard the slices above — confirm
}
|
||||||
|
|
||||||
|
// NewHandler creates the HTTP mux with all routes.
// Blank Title, ExportDir, and RuntimeMode fields in opts are replaced
// with package defaults before any route is registered. Routes use Go
// 1.22+ method-qualified ServeMux patterns ("GET /...", "POST /...").
func NewHandler(opts HandlerOptions) http.Handler {
	if strings.TrimSpace(opts.Title) == "" {
		opts.Title = defaultTitle
	}
	if strings.TrimSpace(opts.ExportDir) == "" {
		opts.ExportDir = app.DefaultExportDir
	}
	if opts.RuntimeMode == "" {
		opts.RuntimeMode = runtimeenv.ModeAuto
	}

	h := &handler{
		opts:        opts,
		ringCPUTemp: newMetricsRing(120), // 120-sample rolling window
		ringPower:   newMetricsRing(120),
	}
	mux := http.NewServeMux()

	// ── Infrastructure ──────────────────────────────────────────────────────
	mux.HandleFunc("GET /healthz", h.handleHealthz)

	// ── Existing read-only endpoints (preserved for compatibility) ──────────
	mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
	mux.HandleFunc("GET /runtime-health.json", h.handleRuntimeHealthJSON)
	mux.HandleFunc("GET /export/support.tar.gz", h.handleSupportBundleDownload)
	mux.HandleFunc("GET /export/file", h.handleExportFile)
	mux.HandleFunc("GET /export/", h.handleExportIndex)
	mux.HandleFunc("GET /viewer", h.handleViewer)

	// ── API ──────────────────────────────────────────────────────────────────
	// Audit
	mux.HandleFunc("POST /api/audit/run", h.handleAPIAuditRun)
	mux.HandleFunc("GET /api/audit/stream", h.handleAPIAuditStream)

	// SAT
	mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
	mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
	mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
	mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
	mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)

	// Services
	mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
	mux.HandleFunc("POST /api/services/action", h.handleAPIServicesAction)

	// Network
	mux.HandleFunc("GET /api/network", h.handleAPINetworkStatus)
	mux.HandleFunc("POST /api/network/dhcp", h.handleAPINetworkDHCP)
	mux.HandleFunc("POST /api/network/static", h.handleAPINetworkStatic)

	// Export
	mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
	mux.HandleFunc("POST /api/export/bundle", h.handleAPIExportBundle)

	// Tools
	mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)

	// Preflight
	mux.HandleFunc("GET /api/preflight", h.handleAPIPreflight)

	// Metrics — SSE stream of live sensor data + server-side SVG charts
	mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
	mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG)

	// Reanimator chart static assets
	mux.Handle("GET /chart/static/", http.StripPrefix("/chart/static/", web.Static()))

	// ── Pages ────────────────────────────────────────────────────────────────
	mux.HandleFunc("GET /", h.handlePage)

	h.mux = mux
	return mux
}
|
||||||
|
|
||||||
|
// ListenAndServe starts the HTTP server.
|
||||||
|
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||||
|
return http.ListenAndServe(addr, NewHandler(opts))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Infrastructure handlers ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleHealthz(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
_, _ = w.Write([]byte("ok"))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Compatibility endpoints ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAuditJSON(w http.ResponseWriter, r *http.Request) {
|
||||||
|
data, err := loadSnapshot(h.opts.AuditPath)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
http.Error(w, "audit snapshot not found", http.StatusNotFound)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
http.Error(w, fmt.Sprintf("read audit snapshot: %v", err), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
_, _ = w.Write(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleRuntimeHealthJSON(w http.ResponseWriter, r *http.Request) {
|
||||||
|
data, err := loadSnapshot(filepath.Join(h.opts.ExportDir, "runtime-health.json"))
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
http.Error(w, "runtime health not found", http.StatusNotFound)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
http.Error(w, fmt.Sprintf("read runtime health: %v", err), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
_, _ = w.Write(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleSupportBundleDownload builds a fresh support bundle from the
// export directory on every request and serves the resulting archive
// as a gzip attachment named after the file app.BuildSupportBundle
// produced.
func (h *handler) handleSupportBundleDownload(w http.ResponseWriter, r *http.Request) {
	archive, err := app.BuildSupportBundle(h.opts.ExportDir)
	if err != nil {
		http.Error(w, fmt.Sprintf("build support bundle: %v", err), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Cache-Control", "no-store")
	w.Header().Set("Content-Type", "application/gzip")
	// %q quotes the filename so names with spaces stay a single token.
	w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%q", filepath.Base(archive)))
	http.ServeFile(w, r, archive)
}
|
||||||
|
|
||||||
|
func (h *handler) handleExportFile(w http.ResponseWriter, r *http.Request) {
|
||||||
|
rel := strings.TrimSpace(r.URL.Query().Get("path"))
|
||||||
|
if rel == "" {
|
||||||
|
http.Error(w, "path is required", http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
clean := filepath.Clean(rel)
|
||||||
|
if clean == "." || strings.HasPrefix(clean, "..") {
|
||||||
|
http.Error(w, "invalid path", http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
http.ServeFile(w, r, filepath.Join(h.opts.ExportDir, clean))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleExportIndex(w http.ResponseWriter, r *http.Request) {
|
||||||
|
body, err := renderExportIndex(h.opts.ExportDir)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, fmt.Sprintf("render export index: %v", err), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(body))
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleViewer renders the audit snapshot through the Reanimator chart
// viewer package and serves the result as HTML.
func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
	// Read error deliberately ignored: snapshot is nil when missing, and
	// viewer.RenderHTML presumably renders a placeholder for nil input —
	// TODO confirm against the viewer package.
	snapshot, _ := loadSnapshot(h.opts.AuditPath)
	body, err := viewer.RenderHTML(snapshot, h.opts.Title)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Cache-Control", "no-store")
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	_, _ = w.Write(body)
}
|
||||||
|
|
||||||
|
// handleMetricsChartSVG renders one metrics ring as an SVG line chart.
// The metric is named in the URL — /api/metrics/chart/<name>[.svg] —
// where <name> is "cpu-temp" or "power"; any other name is a 404.
func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request) {
	name := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
	name = strings.TrimSuffix(name, ".svg")

	// Select the ring and display metadata for the requested metric.
	var ring *metricsRing
	var title, unit string
	switch name {
	case "cpu-temp":
		ring, title, unit = h.ringCPUTemp, "CPU Temperature", "°C"
	case "power":
		ring, title, unit = h.ringPower, "System Power", "W"
	default:
		http.NotFound(w, r)
		return
	}

	vals, labels := ring.snapshot()
	if len(vals) == 0 {
		// No samples yet: draw a single flat zero point so the chart
		// library still produces a valid image.
		vals = []float64{0}
		labels = []string{""}
	}

	// Sparse x-axis labels: keep roughly six, blank out the rest.
	sparse := make([]string, len(labels))
	step := len(labels) / 6
	if step < 1 {
		step = 1
	}
	for i := range labels {
		if i%step == 0 {
			sparse[i] = labels[i]
		}
	}

	opt := gocharts.NewLineChartOptionWithData([][]float64{vals})
	opt.Title = gocharts.TitleOption{Text: title + " (" + unit + ")"}
	opt.XAxis.Labels = sparse
	// Single series — a legend would only add noise.
	opt.Legend = gocharts.LegendOption{Show: gocharts.Ptr(false)}

	p := gocharts.NewPainter(gocharts.PainterOptions{
		OutputFormat: gocharts.ChartOutputSVG,
		Width:        600,
		Height:       180,
	}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
	if err := p.LineChart(opt); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	buf, err := p.Bytes()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "image/svg+xml")
	w.Header().Set("Cache-Control", "no-store")
	_, _ = w.Write(buf)
}
|
||||||
|
|
||||||
|
// ── Page handler ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
||||||
|
page := strings.TrimPrefix(r.URL.Path, "/")
|
||||||
|
if page == "" {
|
||||||
|
page = "dashboard"
|
||||||
|
}
|
||||||
|
body := renderPage(page, h.opts)
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(body))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Helpers ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func loadSnapshot(path string) ([]byte, error) {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return nil, os.ErrNotExist
|
||||||
|
}
|
||||||
|
return os.ReadFile(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeJSON sends v as JSON with status 200.
|
||||||
|
func writeJSON(w http.ResponseWriter, v any) {
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_ = json.NewEncoder(w).Encode(v)
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeError sends a JSON error response.
|
||||||
|
func writeError(w http.ResponseWriter, status int, msg string) {
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.WriteHeader(status)
|
||||||
|
_ = json.NewEncoder(w).Encode(map[string]string{"error": msg})
|
||||||
|
}
|
||||||
167
audit/internal/webui/server_test.go
Normal file
167
audit/internal/webui/server_test.go
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestRootRendersShellWithIframe verifies that GET / returns the page
// shell containing the viewer iframe, the support-bundle download link,
// and a no-store Cache-Control header — and that it still renders after
// the snapshot file changes on disk (i.e. nothing is cached at startup).
func TestRootRendersShellWithIframe(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "audit.json")
	exportDir := filepath.Join(dir, "export")
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-OLD"}}}`), 0644); err != nil {
		t.Fatal(err)
	}

	handler := NewHandler(HandlerOptions{
		Title:     "Bee Hardware Audit",
		AuditPath: path,
		ExportDir: exportDir,
	})

	first := httptest.NewRecorder()
	handler.ServeHTTP(first, httptest.NewRequest(http.MethodGet, "/", nil))
	if first.Code != http.StatusOK {
		t.Fatalf("first status=%d", first.Code)
	}
	if !strings.Contains(first.Body.String(), `iframe`) || !strings.Contains(first.Body.String(), `src="/viewer"`) {
		t.Fatalf("first body missing iframe viewer: %s", first.Body.String())
	}
	if !strings.Contains(first.Body.String(), "/export/support.tar.gz") {
		t.Fatalf("first body missing support bundle link: %s", first.Body.String())
	}
	if got := first.Header().Get("Cache-Control"); got != "no-store" {
		t.Fatalf("first cache-control=%q", got)
	}

	// Overwrite the snapshot and render again: the shell must be built
	// per-request, not captured once at handler construction.
	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:05:00Z","hardware":{"board":{"serial_number":"SERIAL-NEW"}}}`), 0644); err != nil {
		t.Fatal(err)
	}

	second := httptest.NewRecorder()
	handler.ServeHTTP(second, httptest.NewRequest(http.MethodGet, "/", nil))
	if second.Code != http.StatusOK {
		t.Fatalf("second status=%d", second.Code)
	}
	if !strings.Contains(second.Body.String(), `src="/viewer"`) {
		t.Fatalf("second body missing iframe viewer: %s", second.Body.String())
	}
}
|
||||||
|
|
||||||
|
// TestViewerRendersLatestSnapshot verifies that GET /viewer re-reads the
// snapshot file on every request: after the file is overwritten, the new
// serial number appears and the old one is gone.
func TestViewerRendersLatestSnapshot(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "audit.json")
	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-OLD"}}}`), 0644); err != nil {
		t.Fatal(err)
	}

	handler := NewHandler(HandlerOptions{AuditPath: path})
	first := httptest.NewRecorder()
	handler.ServeHTTP(first, httptest.NewRequest(http.MethodGet, "/viewer", nil))
	if first.Code != http.StatusOK {
		t.Fatalf("first status=%d", first.Code)
	}
	if !strings.Contains(first.Body.String(), "SERIAL-OLD") {
		t.Fatalf("viewer body missing old serial: %s", first.Body.String())
	}

	// Replace the snapshot; the next render must reflect the new data.
	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:05:00Z","hardware":{"board":{"serial_number":"SERIAL-NEW"}}}`), 0644); err != nil {
		t.Fatal(err)
	}

	second := httptest.NewRecorder()
	handler.ServeHTTP(second, httptest.NewRequest(http.MethodGet, "/viewer", nil))
	if second.Code != http.StatusOK {
		t.Fatalf("second status=%d", second.Code)
	}
	if !strings.Contains(second.Body.String(), "SERIAL-NEW") {
		t.Fatalf("viewer body missing new serial: %s", second.Body.String())
	}
	if strings.Contains(second.Body.String(), "SERIAL-OLD") {
		t.Fatalf("viewer body still contains old serial: %s", second.Body.String())
	}
}
|
||||||
|
|
||||||
|
// TestAuditJSONServesLatestSnapshot verifies that GET /audit.json
// returns the snapshot file verbatim with a JSON content type.
func TestAuditJSONServesLatestSnapshot(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "audit.json")
	body := `{"hardware":{"board":{"serial_number":"SERIAL-API"}}}`
	if err := os.WriteFile(path, []byte(body), 0644); err != nil {
		t.Fatal(err)
	}

	handler := NewHandler(HandlerOptions{AuditPath: path})
	rec := httptest.NewRecorder()
	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit.json", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d", rec.Code)
	}
	if got := strings.TrimSpace(rec.Body.String()); got != body {
		t.Fatalf("body=%q want %q", got, body)
	}
	if got := rec.Header().Get("Content-Type"); !strings.Contains(got, "application/json") {
		t.Fatalf("content-type=%q", got)
	}
}
|
||||||
|
|
||||||
|
// TestMissingAuditJSONReturnsNotFound verifies that GET /audit.json
// yields 404 when the configured snapshot path does not exist.
func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
	handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"})
	rec := httptest.NewRecorder()
	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit.json", nil))
	if rec.Code != http.StatusNotFound {
		t.Fatalf("status=%d want %d", rec.Code, http.StatusNotFound)
	}
}
|
||||||
|
|
||||||
|
// TestSupportBundleEndpointReturnsArchive verifies that
// GET /export/support.tar.gz returns a non-empty archive with an
// attachment Content-Disposition header.
func TestSupportBundleEndpointReturnsArchive(t *testing.T) {
	dir := t.TempDir()
	exportDir := filepath.Join(dir, "export")
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		t.Fatal(err)
	}
	// Seed one file so the bundle has content.
	if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
		t.Fatal(err)
	}

	handler := NewHandler(HandlerOptions{ExportDir: exportDir})
	rec := httptest.NewRecorder()
	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/export/support.tar.gz", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	if got := rec.Header().Get("Content-Disposition"); !strings.Contains(got, "attachment;") {
		t.Fatalf("content-disposition=%q", got)
	}
	if rec.Body.Len() == 0 {
		t.Fatal("empty archive body")
	}
}
|
||||||
|
|
||||||
|
// TestRuntimeHealthEndpointReturnsJSON verifies that
// GET /runtime-health.json serves <ExportDir>/runtime-health.json verbatim.
func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
	dir := t.TempDir()
	exportDir := filepath.Join(dir, "export")
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		t.Fatal(err)
	}
	body := `{"status":"PARTIAL","checked_at":"2026-03-16T10:00:00Z"}`
	if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(body), 0644); err != nil {
		t.Fatal(err)
	}

	handler := NewHandler(HandlerOptions{ExportDir: exportDir})
	rec := httptest.NewRecorder()
	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/runtime-health.json", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	if strings.TrimSpace(rec.Body.String()) != body {
		t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
	}
}
|
||||||
@@ -9,4 +9,5 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
|||||||
|---|---|
|
|---|---|
|
||||||
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
||||||
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
||||||
|
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||||
| `decisions/` | Architectural decision log |
|
| `decisions/` | Architectural decision log |
|
||||||
|
|||||||
@@ -4,100 +4,113 @@
|
|||||||
|
|
||||||
**The live CD runs in an isolated network segment with no internet access.**
|
**The live CD runs in an isolated network segment with no internet access.**
|
||||||
All binaries, kernel modules, and tools must be baked into the ISO at build time.
|
All binaries, kernel modules, and tools must be baked into the ISO at build time.
|
||||||
No `apk add`, no downloads, no package manager calls are allowed at boot.
|
No package installation, no downloads, and no package manager calls are allowed at boot.
|
||||||
DHCP is used only for LAN (operator SSH access). Internet is NOT available.
|
DHCP is used only for LAN (operator SSH access). Internet is NOT available.
|
||||||
|
|
||||||
## Boot sequence (single ISO)
|
## Boot sequence (single ISO)
|
||||||
|
|
||||||
OpenRC default runlevel, service start order:
|
The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
|
||||||
|
|
||||||
|
`systemd` boot order:
|
||||||
|
|
||||||
```
|
```
|
||||||
localmount
|
local-fs.target
|
||||||
├── bee-sshsetup (creates bee user, sets password; runs before dropbear)
|
├── bee-sshsetup.service (enables SSH key auth; password fallback only if marker exists)
|
||||||
│ └── dropbear (SSH on port 22 — starts without network)
|
│ └── ssh.service (OpenSSH on port 22 — starts without network)
|
||||||
├── bee-network (udhcpc -b on all physical interfaces, non-blocking)
|
├── bee-network.service (starts `dhclient -nw` on all physical interfaces, non-blocking)
|
||||||
│ └── bee-nvidia (insmod nvidia*.ko from /usr/local/lib/nvidia/,
|
├── bee-nvidia.service (insmod nvidia*.ko from /usr/local/lib/nvidia/,
|
||||||
│ creates libnvidia-ml.so.1 symlinks in /usr/lib/)
|
│ creates /dev/nvidia* nodes)
|
||||||
│ └── bee-audit (runs audit binary → /var/log/bee-audit.json)
|
├── bee-audit.service (runs `bee audit` → /var/log/bee-audit.json,
|
||||||
|
│ never blocks boot on partial collector failures)
|
||||||
|
├── bee-web.service (runs `bee web` on :80 — full interactive web UI)
|
||||||
|
└── bee-desktop.service (startx → openbox + chromium http://localhost/)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Critical invariants:**
|
**Critical invariants:**
|
||||||
- Dropbear MUST start without network. `bee-sshsetup` has `need localmount` only.
|
- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
|
||||||
- `bee-network` uses `udhcpc -b` (background) — retries indefinitely if no cable.
|
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
|
||||||
- `bee-nvidia` loads modules via `insmod` with absolute paths — NOT `modprobe`.
|
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
|
||||||
Reason: modloop squashfs mounts over `/lib/modules/<kver>/` at boot, making it
|
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
|
||||||
read-only. The overlay's modules at that path are inaccessible. Modules are stored
|
Reason: the modules are shipped in the ISO overlay under `/usr/local/lib/nvidia/`, not in the host module tree.
|
||||||
at `/usr/local/lib/nvidia/` (overlay path, always writable).
|
- `bee-audit.service` does not wait for `network-online.target`; audit is local and must run even if DHCP is broken.
|
||||||
- `bee-nvidia` creates `libnvidia-ml.so.1` symlinks in `/usr/lib/` — required because
|
- `bee-audit.service` logs audit failures but does not turn partial collector problems into a boot blocker.
|
||||||
`nvidia-smi` is a glibc binary that looks for the soname symlink, not the versioned file.
|
- `bee-web.service` binds `0.0.0.0:80` and always renders the current `/var/log/bee-audit.json` contents.
|
||||||
- `gcompat` package provides `/lib64/ld-linux-x86-64.so.2` for glibc compat on Alpine musl.
|
- Audit JSON now includes a `hardware.summary` block with overall verdict and warning/failure counts.
|
||||||
- `bee-audit` uses `after bee-nvidia` — ensures NVIDIA enrichment succeeds.
|
|
||||||
- `bee-audit` uses `eend 0` always — never fails boot even if audit errors.
|
## Console and login flow
|
||||||
|
|
||||||
|
Local-console behavior:
|
||||||
|
|
||||||
|
```text
|
||||||
|
tty1
|
||||||
|
└── live-config autologin → bee
|
||||||
|
└── /home/bee/.profile (prints web UI URLs)
|
||||||
|
|
||||||
|
display :0
|
||||||
|
└── bee-desktop.service (User=bee)
|
||||||
|
└── startx /usr/local/bin/bee-openbox-session -- :0
|
||||||
|
├── tint2 (taskbar)
|
||||||
|
├── chromium http://localhost/
|
||||||
|
└── openbox (WM)
|
||||||
|
```
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- local `tty1` lands in user `bee`, not directly in `root`
|
||||||
|
- `bee-desktop.service` starts X11 + openbox + Chromium automatically after `bee-web.service`
|
||||||
|
- Chromium opens `http://localhost/` — the full interactive web UI
|
||||||
|
- SSH is independent from the desktop path
|
||||||
|
- serial console support is enabled for VM boot debugging
|
||||||
|
|
||||||
## ISO build sequence
|
## ISO build sequence
|
||||||
|
|
||||||
```
|
```
|
||||||
build.sh [--authorized-keys /path/to/keys]
|
build-in-container.sh [--authorized-keys /path/to/keys]
|
||||||
1. compile audit binary (skip if .go files older than binary)
|
1. compile `bee` binary (skip if .go files older than binary)
|
||||||
2. inject authorized_keys into overlay/root/.ssh/ (or set password fallback)
|
2. create a temporary overlay staging dir under `dist/`
|
||||||
3. copy audit binary → overlay/usr/local/bin/audit
|
3. inject authorized_keys into staged `root/.ssh/` (or set password fallback marker)
|
||||||
4. copy vendor binaries from iso/vendor/ → overlay/usr/local/bin/
|
4. copy `bee` binary → staged `/usr/local/bin/bee`
|
||||||
(storcli64, sas2ircu, sas3ircu, mstflint, gpu_burn — each optional)
|
5. copy vendor binaries from `iso/vendor/` → staged `/usr/local/bin/`
|
||||||
5. build-nvidia-module.sh:
|
(`storcli64`, `sas2ircu`, `sas3ircu`, `arcconf`, `ssacli` — optional; `mstflint` comes from the Debian package set)
|
||||||
a. apk add linux-lts-dev (always, to get current Alpine 3.21 kernel headers)
|
6. `build-nvidia-module.sh`:
|
||||||
b. detect KVER from /usr/src/linux-headers-*
|
a. install Debian kernel headers if missing
|
||||||
c. download NVIDIA .run installer (sha256 verified, cached in dist/)
|
b. download NVIDIA `.run` installer (sha256 verified, cached in `dist/`)
|
||||||
d. extract installer
|
c. extract installer
|
||||||
e. build kernel modules against linux-lts headers
|
d. build kernel modules against Debian headers
|
||||||
f. create libnvidia-ml.so.1 / libcuda.so.1 symlinks in cache
|
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
|
||||||
g. cache in dist/nvidia-<version>-<kver>/
|
f. cache in `dist/nvidia-<version>-<kver>/`
|
||||||
6. inject NVIDIA .ko → overlay/usr/local/lib/nvidia/
|
7. `build-cublas.sh`:
|
||||||
7. inject nvidia-smi → overlay/usr/local/bin/nvidia-smi
|
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
||||||
8. inject libnvidia-ml + libcuda → overlay/usr/lib/
|
b. verify packages against repo `Packages.gz`
|
||||||
9. write overlay/etc/bee-release (versions + git commit)
|
c. extract headers for `bee-gpu-stress` build
|
||||||
10. export BEE_BUILD_INFO for motd substitution
|
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
||||||
11. mkimage.sh (from /var/tmp, TMPDIR=/var/tmp):
|
8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
|
||||||
kernel_* section — cached (linux-lts modloop)
|
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||||
apks_* section — cached (downloaded packages)
|
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||||
syslinux_* / grub_* — cached
|
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
||||||
apkovl — always regenerated (genapkovl-bee.sh)
|
12. write staged `/etc/bee-release` (versions + git commit)
|
||||||
final ISO — always assembled
|
13. patch staged `motd` with build metadata
|
||||||
|
14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||||
|
15. sync staged overlay into workdir `config/includes.chroot/`
|
||||||
|
16. run `lb config && lb build` inside the privileged builder container
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Build host notes:
|
||||||
|
- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
|
||||||
|
- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
|
||||||
|
- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
|
||||||
|
|
||||||
**Critical invariants:**
|
**Critical invariants:**
|
||||||
- `KERNEL_PKG_VERSION` in `iso/builder/VERSIONS` pins the exact Alpine package version
|
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
|
||||||
(e.g. `6.12.76-r0`). This version is used in THREE places that MUST stay in sync:
|
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||||
1. `build-nvidia-module.sh` — `apk add linux-lts-dev=${KERNEL_PKG_VERSION}` (compile headers)
|
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
||||||
2. `mkimg.bee.sh` — `linux-lts=${KERNEL_PKG_VERSION}` in apks list (ISO kernel)
|
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
||||||
3. `build.sh` — build-time verification that headers match pin (fails loudly if not)
|
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||||
When Alpine releases a new linux-lts patch (e.g. r0 → r1), update KERNEL_PKG_VERSION
|
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
||||||
in VERSIONS — that's the only place to change. The build will fail loudly if the pin
|
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
||||||
doesn't match the installed headers, so stale pins are caught immediately.
|
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
||||||
- **All three must use the same APK mirror: `dl-cdn.alpinelinux.org`.** Both
|
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
|
||||||
`build-nvidia-module.sh` (apk add) and `mkimage.sh` (--repository) explicitly use
|
- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
|
||||||
`https://dl-cdn.alpinelinux.org/alpine/v${ALPINE_VERSION}/main|community`.
|
- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
|
||||||
Never use the builder's local `/etc/apk/repositories` — its mirror may serve
|
|
||||||
a different package state, causing "unable to select package" failures.
|
|
||||||
- `linux-lts-dev` is always installed (not conditional) — stale 6.6.x headers on the
|
|
||||||
builder would cause modules to be built for the wrong kernel and never load at runtime.
|
|
||||||
- NVIDIA modules go to `overlay/usr/local/lib/nvidia/` — NOT `lib/modules/<kver>/extra/`.
|
|
||||||
- `genapkovl-bee.sh` must be copied to `/var/tmp/` (CWD when mkimage runs).
|
|
||||||
- `TMPDIR=/var/tmp` required — tmpfs `/tmp` is only ~1GB, too small for kernel firmware.
|
|
||||||
- Workdir cleanup preserves `apks_*`, `kernel_*`, `syslinux_*`, `grub_*` cache dirs.
|
|
||||||
|
|
||||||
## gpu_burn vendor binary
|
|
||||||
|
|
||||||
`gpu_burn` requires CUDA nvcc to build. It is NOT built as part of the main ISO build.
|
|
||||||
Build separately on the builder VM and place in `iso/vendor/gpu_burn`:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
sh iso/builder/build-gpu-burn.sh dist/
|
|
||||||
cp dist/gpu_burn iso/vendor/gpu_burn
|
|
||||||
cp dist/compare.ptx iso/vendor/compare.ptx
|
|
||||||
```
|
|
||||||
|
|
||||||
Requires: CUDA 12.8+ (supports GCC 14, Alpine 3.21), libxml2, g++, make, git.
|
|
||||||
The `build.sh` will include it automatically if `iso/vendor/gpu_burn` exists.
|
|
||||||
|
|
||||||
## Post-boot smoke test
|
## Post-boot smoke test
|
||||||
|
|
||||||
@@ -109,35 +122,79 @@ ssh root@<ip> 'sh -s' < iso/builder/smoketest.sh
|
|||||||
|
|
||||||
Exit code 0 = all required checks pass. All `FAIL` lines must be zero before shipping.
|
Exit code 0 = all required checks pass. All `FAIL` lines must be zero before shipping.
|
||||||
|
|
||||||
Key checks: NVIDIA modules loaded, nvidia-smi sees all GPUs, lib symlinks present,
|
Key checks: NVIDIA modules loaded, `nvidia-smi` sees all GPUs, lib symlinks present,
|
||||||
gcompat installed, services running, audit completed with NVIDIA enrichment, internet.
|
systemd services running, audit completed with NVIDIA enrichment, LAN reachability.
|
||||||
|
|
||||||
## apkovl mechanism
|
Current validation state:
|
||||||
|
- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and TUI startup
|
||||||
|
- real hardware validation is still required before treating the ISO as release-ready
|
||||||
|
|
||||||
The apkovl is a `.tar.gz` injected into the ISO at `/boot/`. Alpine initramfs extracts
|
## Overlay mechanism
|
||||||
it at boot, overlaying `/etc`, `/usr`, `/root`, `/lib` on the tmpfs root.
|
|
||||||
|
|
||||||
`genapkovl-bee.sh` generates the tarball containing:
|
`live-build` copies files from `config/includes.chroot/` into the ISO filesystem.
|
||||||
- `/etc/apk/world` — package list (apk installs on first boot)
|
`build.sh` prepares a staged overlay, then syncs it into a temporary workdir's
|
||||||
- `/etc/runlevels/*/` — OpenRC service symlinks
|
`config/includes.chroot/` before running `lb build`.
|
||||||
- `/etc/conf.d/dropbear` — `DROPBEAR_OPTS="-R -B"`
|
|
||||||
- `/etc/network/interfaces` — lo only (bee-network handles DHCP)
|
|
||||||
- `/etc/hostname`
|
|
||||||
- Everything from `iso/overlay/` (init scripts, binaries, ssh keys, tui)
|
|
||||||
|
|
||||||
## Collector flow
|
## Collector flow
|
||||||
|
|
||||||
```
|
```
|
||||||
audit binary start
|
`bee audit` start
|
||||||
1. board collector (dmidecode -t 0,1,2)
|
1. board collector (dmidecode -t 0,1,2)
|
||||||
2. cpu collector (dmidecode -t 4)
|
2. cpu collector (dmidecode -t 4)
|
||||||
3. memory collector (dmidecode -t 17)
|
3. memory collector (dmidecode -t 17)
|
||||||
4. storage collector (lsblk -J, smartctl -j, nvme id-ctrl, nvme smart-log)
|
4. storage collector (lsblk -J, smartctl -j, nvme id-ctrl, nvme smart-log)
|
||||||
5. pcie collector (lspci -vmm -D, /sys/bus/pci/devices/)
|
5. pcie collector (lspci -vmm -D, /sys/bus/pci/devices/)
|
||||||
6. psu collector (ipmitool fru — silent if no /dev/ipmi0)
|
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
|
||||||
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
|
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
|
||||||
8. output JSON → /var/log/bee-audit.json
|
8. output JSON → /var/log/bee-audit.json
|
||||||
9. QR summary to stdout (qrencode if available)
|
9. QR summary to stdout (qrencode if available)
|
||||||
```
|
```
|
||||||
|
|
||||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||||
|
|
||||||
|
Acceptance flows:
|
||||||
|
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
|
||||||
|
- `bee sat memory` → `memtester` archive
|
||||||
|
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||||
|
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||||
|
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||||
|
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
||||||
|
- Ada / Hopper: add `fp8`
|
||||||
|
- Blackwell+: add `fp4`
|
||||||
|
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
||||||
|
- Runtime overrides:
|
||||||
|
- `BEE_GPU_STRESS_SECONDS`
|
||||||
|
- `BEE_GPU_STRESS_SIZE_MB`
|
||||||
|
- `BEE_MEMTESTER_SIZE_MB`
|
||||||
|
- `BEE_MEMTESTER_PASSES`
|
||||||
|
|
||||||
|
## NVIDIA SAT TUI flow (v1.0.0+)
|
||||||
|
|
||||||
|
```
|
||||||
|
TUI: Acceptance tests → NVIDIA command pack
|
||||||
|
1. screenNvidiaSATSetup
|
||||||
|
a. enumerate GPUs via `nvidia-smi --query-gpu=index,name,memory.total`
|
||||||
|
b. user selects duration preset: 10 min / 1 h / 8 h / 24 h
|
||||||
|
c. user selects GPUs via checkboxes (all selected by default)
|
||||||
|
d. memory size = max(selected GPU memory) — auto-detected, not exposed to user
|
||||||
|
2. Start → screenNvidiaSATRunning
|
||||||
|
a. CUDA_VISIBLE_DEVICES set to selected GPU indices
|
||||||
|
b. tea.Batch: SAT goroutine + tea.ExecProcess(nvtop) launched concurrently
|
||||||
|
c. nvtop occupies full terminal; SAT result queues in background
|
||||||
|
d. [o] reopen nvtop at any time; [a] abort (cancels context → kills bee-gpu-stress)
|
||||||
|
3. GPU metrics collection (during bee-gpu-stress)
|
||||||
|
- background goroutine polls `nvidia-smi` every second
|
||||||
|
- per-second rows: elapsed, GPU index, temp°C, usage%, power W, clock MHz
|
||||||
|
- outputs: gpu-metrics.csv, gpu-metrics.html (offline SVG chart), gpu-metrics-term.txt
|
||||||
|
4. After SAT completes
|
||||||
|
- result shown in screenOutput with terminal line-chart (gpu-metrics-term.txt)
|
||||||
|
- chart is asciigraph-style: box-drawing chars (╭╮╰╯─│), 4 series per GPU,
|
||||||
|
Y axis with ticks, ANSI colours (red=temp, blue=usage, green=power, yellow=clock)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Critical invariants:**
|
||||||
|
- `nvtop` must be in `iso/builder/config/package-lists/bee.list.chroot` (baked into ISO).
|
||||||
|
- `bee-gpu-stress` uses `exec.CommandContext` — aborted on cancel.
|
||||||
|
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
||||||
|
- If `nvtop` is not found on PATH, SAT still runs without it (graceful degradation).
|
||||||
|
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
Hardware audit LiveCD. Boots on a server via BMC virtual media or USB.
|
Hardware audit LiveCD. Boots on a server via BMC virtual media or USB.
|
||||||
Collects hardware inventory at OS level (not through BMC/Redfish).
|
Collects hardware inventory at OS level (not through BMC/Redfish).
|
||||||
Produces `HardwareIngestRequest` JSON compatible with core/reanimator.
|
Produces `HardwareIngestRequest` JSON compatible with the contract in `bible-local/docs/hardware-ingest-contract.md`.
|
||||||
|
|
||||||
## Why it exists
|
## Why it exists
|
||||||
|
|
||||||
@@ -19,18 +19,23 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
## In scope
|
## In scope
|
||||||
|
|
||||||
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
||||||
- Unattended operation — no user interaction required
|
- Machine-readable health summary derived from collector verdicts
|
||||||
|
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
||||||
|
- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
|
||||||
|
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||||
|
- Automatic boot audit with operator-facing local console and SSH access
|
||||||
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
||||||
- SSH access (dropbear) always available for inspection and debugging
|
- SSH access (OpenSSH) always available for inspection and debugging
|
||||||
- Interactive TUI (`bee-tui`) for network setup, service management, GPU tests
|
- Full web UI via `bee web` on port 80: interactive control panel with live metrics, SAT tests, network config, service management, export, and tools
|
||||||
- GPU stress testing via `gpu_burn` (vendor binary, optional)
|
- Local operator desktop: openbox + Xorg + Chromium auto-opening `http://localhost/`
|
||||||
|
- Local `tty1` operator UX: `bee` autologin, openbox desktop auto-starts with Chromium on `http://localhost/`
|
||||||
|
|
||||||
## Network isolation — CRITICAL
|
## Network isolation — CRITICAL
|
||||||
|
|
||||||
**The live CD runs in an isolated network segment with no internet access.**
|
**The live CD runs in an isolated network segment with no internet access.**
|
||||||
|
|
||||||
- All tools, drivers, and binaries MUST be pre-baked into the ISO at build time
|
- All tools, drivers, and binaries MUST be pre-baked into the ISO at build time
|
||||||
- No `apk add` at boot — packages are installed during ISO creation, not at runtime
|
- No package installation at boot — packages are installed during ISO creation, not at runtime
|
||||||
- No downloads at boot — NVIDIA modules, vendor tools, and all binaries come from the ISO overlay
|
- No downloads at boot — NVIDIA modules, vendor tools, and all binaries come from the ISO overlay
|
||||||
- DHCP is used only for LAN access (SSH from operator laptop); internet is NOT assumed
|
- DHCP is used only for LAN access (SSH from operator laptop); internet is NOT assumed
|
||||||
- Any feature requiring network downloads cannot be added to the live CD
|
- Any feature requiring network downloads cannot be added to the live CD
|
||||||
@@ -43,32 +48,66 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
- Anything requiring persistent storage on the audited machine
|
- Anything requiring persistent storage on the audited machine
|
||||||
- Windows support
|
- Windows support
|
||||||
- Any functionality requiring internet access at boot
|
- Any functionality requiring internet access at boot
|
||||||
|
- Component lifecycle/history across multiple snapshots
|
||||||
|
- Status transition history (`status_history`, `status_changed_at`) derived from previous exports
|
||||||
|
- Replacement detection between two or more audit runs
|
||||||
|
|
||||||
|
## Contract boundary
|
||||||
|
|
||||||
|
- `bee` is responsible for the current hardware snapshot only.
|
||||||
|
- `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
|
||||||
|
- Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
|
||||||
|
- Contract fields that have no honest local source on a generic Linux host may remain empty.
|
||||||
|
|
||||||
## Tech stack
|
## Tech stack
|
||||||
|
|
||||||
| Component | Technology |
|
| Component | Technology |
|
||||||
|---|---|
|
|---|---|
|
||||||
| Audit binary | Go, static, `CGO_ENABLED=0` |
|
| Audit binary | Go, static, `CGO_ENABLED=0` |
|
||||||
| LiveCD | Alpine Linux 3.21, linux-lts 6.12.x |
|
| Live ISO | Debian 12 (bookworm), amd64 live-build image |
|
||||||
| ISO build | Alpine mkimage + apkovl overlay (`iso/overlay/`) |
|
| ISO build | Debian `live-build` + overlay sync into `config/includes.chroot/` |
|
||||||
| Init system | OpenRC |
|
| Init system | `systemd` |
|
||||||
| SSH | Dropbear (always included) |
|
| SSH | OpenSSH server |
|
||||||
| NVIDIA driver | Proprietary `.run` installer, built against linux-lts headers |
|
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
||||||
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` (not modloop path) |
|
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
||||||
| glibc compat | `gcompat` — required for `nvidia-smi` (glibc binary on musl Alpine) |
|
| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||||
| Builder VM | Alpine 3.21 |
|
| Builder | Debian 12 host/VM or Debian 12 container image |
|
||||||
|
|
||||||
|
## Operator UX
|
||||||
|
|
||||||
|
- On the live ISO, `tty1` autologins as `bee`
|
||||||
|
- `bee-desktop.service` starts X11 + openbox + Chromium on display `:0`
|
||||||
|
- Chromium opens `http://localhost/` — the full web UI
|
||||||
|
- SSH remains available independently of the local console path
|
||||||
|
- Remote operators can open `http://<ip>/` in any browser on the same LAN
|
||||||
|
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
|
||||||
|
- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
|
||||||
|
|
||||||
|
## Runtime split
|
||||||
|
|
||||||
|
- The main Go application must run both on a normal Linux host and inside the live ISO
|
||||||
|
- Live-ISO-only responsibilities stay in `iso/` integration code
|
||||||
|
- Live ISO launches the Go CLI with `--runtime livecd`
|
||||||
|
- Local/manual runs use `--runtime auto` or `--runtime local`
|
||||||
|
- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup
|
||||||
|
|
||||||
## Key paths
|
## Key paths
|
||||||
|
|
||||||
| Path | Purpose |
|
| Path | Purpose |
|
||||||
|---|---|
|
|---|---|
|
||||||
| `audit/cmd/audit/` | CLI entry point |
|
| `audit/cmd/bee/` | Main CLI entry point |
|
||||||
| `audit/internal/collector/` | Per-subsystem collectors |
|
| `audit/internal/collector/` | Per-subsystem collectors |
|
||||||
| `audit/internal/schema/` | HardwareIngestRequest types |
|
| `audit/internal/schema/` | HardwareIngestRequest types |
|
||||||
| `iso/builder/` | ISO build scripts and mkimage profile |
|
| `iso/builder/` | ISO build scripts and `live-build` profile |
|
||||||
| `iso/overlay/` | Single overlay: files injected into ISO via apkovl |
|
| `iso/overlay/` | Source overlay copied into a staged build overlay |
|
||||||
| `iso/vendor/` | Optional pre-built vendor binaries (storcli64, gpu_burn, …) |
|
| `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
|
||||||
| `iso/builder/VERSIONS` | Pinned versions: Alpine, Go, NVIDIA driver, kernel |
|
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
|
||||||
|
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
||||||
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
||||||
|
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
|
||||||
|
| `iso/overlay/home/bee/.profile` | `bee` shell profile (PATH only) |
|
||||||
|
| `iso/overlay/etc/systemd/system/bee-desktop.service` | starts X11 + openbox + chromium |
|
||||||
|
| `iso/overlay/usr/local/bin/bee-desktop` | startx wrapper for bee-desktop.service |
|
||||||
|
| `iso/overlay/usr/local/bin/bee-openbox-session` | xinitrc: tint2 + chromium + openbox |
|
||||||
| `dist/` | Build outputs (gitignored) |
|
| `dist/` | Build outputs (gitignored) |
|
||||||
| `iso/out/` | Downloaded ISO files (gitignored) |
|
| `iso/out/` | Downloaded ISO files (gitignored) |
|
||||||
|
|||||||
@@ -1,21 +1,89 @@
|
|||||||
# Backlog
|
# Backlog
|
||||||
|
|
||||||
## GPU stress test (H100)
|
## BMC версия через IPMI
|
||||||
|
|
||||||
**Задача:** добавить GPU burn/stress тест в bee-tui без существенного увеличения ISO.
|
**Статус:** реализовано.
|
||||||
|
|
||||||
**Контекст:**
|
Добавить сбор версии BMC firmware в board collector:
|
||||||
- `gpu_burn` (wilicc/gpu-burn) не подходит — требует `libcublas.so` (~500MB), что раздует ISO кратно
|
- Команда: `ipmitool mc info` → поле `Firmware Revision`
|
||||||
- `libcuda.so` уже есть в ISO (из NVIDIA .run installer)
|
- Записывать в `hardware.firmware[]` как `{device_name: "BMC", version: "..."}`
|
||||||
|
- Показывать в TUI правой колонке рядом с BIOS версией
|
||||||
|
- Graceful skip если `/dev/ipmi0` отсутствует (silent: same pattern as PSU collector)
|
||||||
|
|
||||||
**Выбранный подход:** написать минимальный стресс-тул на CUDA Driver API
|
## CPU acceptance test через stress-ng
|
||||||
- Использует только `libcuda.so` (уже в ISO) — никаких новых зависимостей
|
|
||||||
- Реализует матричное умножение или memory bandwidth через `cuLaunchKernel`
|
|
||||||
- Бинарь ~100KB, компилируется через `nvcc` на builder VM, кладётся в `iso/vendor/`
|
|
||||||
- bee-tui вызывает его вместо `gpu_burn`
|
|
||||||
|
|
||||||
**Отклонённые варианты:**
|
**Статус:** реализовано. CPU в Health Check получает PASS/FAIL из summary.txt.
|
||||||
- `gpu_burn` — нужен libcublas (~500MB)
|
|
||||||
- `nvbandwidth` — только bandwidth, не жжёт FLOPs; нужен libcudart (~8MB)
|
Добавить CPU SAT на базе `stress-ng`:
|
||||||
- DCGM diag — правильный инструмент для H100 но ~100MB установка
|
- Bake `stress-ng` в ISO (добавить в `bee.list.chroot`)
|
||||||
- Download on demand — нужен libcublas, проблема та же
|
- Новый `bee sat cpu` — запускает `stress-ng --cpu 0 --cpu-method all --timeout <N>` где N = duration из режима (Quick=60s, Standard=300s, Express=900s)
|
||||||
|
- Параллельно снимает температуры через `sensors` и throttle-флаги из аудит JSON
|
||||||
|
- Результат: SAT архив с summary.txt в формате других SAT (overall_status=OK/FAILED)
|
||||||
|
- После реализации: CPU в Health Check получает реальный PASS/FAIL статус
|
||||||
|
|
||||||
|
## Real hardware validation
|
||||||
|
|
||||||
|
**Статус:** ожидает доступа к железу.
|
||||||
|
|
||||||
|
Что осталось подтвердить на практике:
|
||||||
|
- `bee sat nvidia` на реальном NVIDIA GPU host
|
||||||
|
- `bee sat storage` на NVMe/SATA/RAID host
|
||||||
|
- `ipmitool sdr` parsing на сервере с реальным BMC/IPMI
|
||||||
|
- vendor RAID tooling (`storcli64`, `sas2ircu`, `sas3ircu`, `arcconf`, `ssacli`) в живом ISO
|
||||||
|
|
||||||
|
## SAT result polish
|
||||||
|
|
||||||
|
**Статус:** частично закрыто.
|
||||||
|
|
||||||
|
Что ещё можно улучшить после полевой проверки:
|
||||||
|
- точнее классифицировать vendor-specific self-test outputs в `storage SAT`
|
||||||
|
- подобрать дефолты `memtester` по объёму RAM на целевых машинах
|
||||||
|
- при необходимости расширить `bee-gpu-stress` по длительности/нагрузке
|
||||||
|
|
||||||
|
## Hardware Contract backlog
|
||||||
|
|
||||||
|
**Статус:** уточнён, сокращён до `bee`-only snapshot scope.
|
||||||
|
|
||||||
|
### Не backlog для `bee`
|
||||||
|
|
||||||
|
Эти задачи не должны реализовываться в `bee`, потому что относятся к централизованному ingest/lifecycle слою:
|
||||||
|
- `status_history`
|
||||||
|
- `status_changed_at`
|
||||||
|
- определение замены компонента между snapshot'ами
|
||||||
|
- timeline/lifecycle/history по diff между экспортами
|
||||||
|
|
||||||
|
`bee` отвечает только за текущий snapshot железа и `status_checked_at`.
|
||||||
|
|
||||||
|
### Реализуемо инкрементально
|
||||||
|
|
||||||
|
Эти поля можно развивать дальше по мере появления реальных sample outputs и vendor-specific parser'ов:
|
||||||
|
- `cpus.correctable_error_count`
|
||||||
|
- `cpus.uncorrectable_error_count`
|
||||||
|
- `power_supplies.life_remaining_pct`
|
||||||
|
- `power_supplies.life_used_pct`
|
||||||
|
- `pcie_devices.battery_charge_pct`
|
||||||
|
- `pcie_devices.battery_health_pct`
|
||||||
|
- `pcie_devices.battery_temperature_c`
|
||||||
|
- `pcie_devices.battery_voltage_v`
|
||||||
|
- `pcie_devices.battery_replace_required`
|
||||||
|
|
||||||
|
### Vendor/platform-specific, часто пустые
|
||||||
|
|
||||||
|
Эти поля допустимо оставлять пустыми на части платформ даже после реализации parser'ов:
|
||||||
|
- `power_supplies.life_remaining_pct`
|
||||||
|
- `power_supplies.life_used_pct`
|
||||||
|
- часть `pcie_devices.battery_*` для неподдержанных RAID/NIC/GPU вендоров
|
||||||
|
|
||||||
|
### Unsupported в `bee`
|
||||||
|
|
||||||
|
Эти поля считаются нереалистичными для общего OS-level hardware snapshotter без synthetic/fake data:
|
||||||
|
- `cpus.life_remaining_pct`
|
||||||
|
- `cpus.life_used_pct`
|
||||||
|
- `memory.life_remaining_pct`
|
||||||
|
- `memory.life_used_pct`
|
||||||
|
- `memory.spare_blocks_remaining_pct`
|
||||||
|
- `memory.performance_degraded`
|
||||||
|
|
||||||
|
Причина: у обычного Linux-host audit обычно нет честного vendor-neutral runtime source для этих метрик.
|
||||||
|
|
||||||
|
Эти поля считаются дропнутыми из backlog `bee` и не должны возвращаться в план работ без появления нового доказуемого локального источника данных на целевых машинах.
|
||||||
|
|||||||
793
bible-local/docs/hardware-ingest-contract.md
Normal file
793
bible-local/docs/hardware-ingest-contract.md
Normal file
@@ -0,0 +1,793 @@
|
|||||||
|
---
|
||||||
|
title: Hardware Ingest JSON Contract
|
||||||
|
version: "2.7"
|
||||||
|
updated: "2026-03-15"
|
||||||
|
maintainer: Reanimator Core
|
||||||
|
audience: external-integrators, ai-agents
|
||||||
|
language: ru
|
||||||
|
---
|
||||||
|
|
||||||
|
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
||||||
|
|
||||||
|
Версия: **2.7** · Дата: **2026-03-15**
|
||||||
|
|
||||||
|
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
||||||
|
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
||||||
|
|
||||||
|
> Актуальная версия документа: https://git.mchus.pro/reanimator/core/src/branch/main/bible-local/docs/hardware-ingest-contract.md
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Changelog
|
||||||
|
|
||||||
|
| Версия | Дата | Изменения |
|
||||||
|
|--------|------|-----------|
|
||||||
|
| 2.7 | 2026-03-15 | Явно запрещён синтез данных в `event_logs`; интеграторы не должны придумывать серийные номера компонентов, если источник их не отдал |
|
||||||
|
| 2.6 | 2026-03-15 | Добавлена необязательная секция `event_logs` для dedup/upsert логов `host` / `bmc` / `redfish` вне history timeline |
|
||||||
|
| 2.5 | 2026-03-15 | Добавлено общее необязательное поле `manufactured_year_week` для компонентных секций (`YYYY-Www`) |
|
||||||
|
| 2.4 | 2026-03-15 | Добавлена первая волна component telemetry: health/life поля для `cpus`, `memory`, `storage`, `pcie_devices`, `power_supplies` |
|
||||||
|
| 2.3 | 2026-03-15 | Добавлены component telemetry поля: `pcie_devices.temperature_c`, `pcie_devices.power_w`, `power_supplies.temperature_c` |
|
||||||
|
| 2.2 | 2026-03-15 | Добавлено поле `numa_node` у `pcie_devices` для topology/affinity |
|
||||||
|
| 2.1 | 2026-03-15 | Добавлена секция `sensors` (fans, power, temperatures, other); поле `mac_addresses` у `pcie_devices`; расширен список значений `device_class` |
|
||||||
|
| 2.0 | 2026-02-01 | История статусов (`status_history`, `status_changed_at`); поля telemetry у PSU; async job response |
|
||||||
|
| 1.0 | 2026-01-01 | Начальная версия контракта |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Принципы
|
||||||
|
|
||||||
|
1. **Snapshot** — JSON описывает состояние сервера на момент сбора. Может включать историю изменений статуса компонентов.
|
||||||
|
2. **Идемпотентность** — повторная отправка идентичного payload не создаёт дублей (дедупликация по хешу).
|
||||||
|
3. **Частичность** — можно передавать только те секции, данные по которым доступны. Пустой массив и отсутствие секции эквивалентны.
|
||||||
|
4. **Строгая схема** — endpoint использует строгий JSON-декодер; неизвестные поля приводят к `400 Bad Request`.
|
||||||
|
5. **Event-driven** — импорт создаёт события в timeline (LOG_COLLECTED, INSTALLED, REMOVED, FIRMWARE_CHANGED и др.).
|
||||||
|
6. **Без синтеза со стороны интегратора** — сборщик передаёт только фактически собранные значения. Нельзя придумывать `serial_number`, `component_ref`, `message`, `message_id` или другие идентификаторы/атрибуты, если источник их не предоставил или парсер не смог их надёжно извлечь.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Endpoint
|
||||||
|
|
||||||
|
```
|
||||||
|
POST /ingest/hardware
|
||||||
|
Content-Type: application/json
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ответ при приёме (202 Accepted):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "accepted",
|
||||||
|
"job_id": "job_01J..."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Импорт выполняется асинхронно. Результат доступен по:
|
||||||
|
```
|
||||||
|
GET /ingest/hardware/jobs/{job_id}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ответ при успехе задачи:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "success",
|
||||||
|
"bundle_id": "lb_01J...",
|
||||||
|
"asset_id": "mach_01J...",
|
||||||
|
"collected_at": "2026-02-10T15:30:00Z",
|
||||||
|
"duplicate": false,
|
||||||
|
"summary": {
|
||||||
|
"parts_observed": 15,
|
||||||
|
"parts_created": 2,
|
||||||
|
"parts_updated": 13,
|
||||||
|
"installations_created": 2,
|
||||||
|
"installations_closed": 1,
|
||||||
|
"timeline_events_created": 9,
|
||||||
|
"failure_events_created": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ответ при дубликате:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "success",
|
||||||
|
"duplicate": true,
|
||||||
|
"message": "LogBundle with this content hash already exists"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ответ при ошибке (400 Bad Request):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "error",
|
||||||
|
"error": "validation_failed",
|
||||||
|
"details": {
|
||||||
|
"field": "hardware.board.serial_number",
|
||||||
|
"message": "serial_number is required"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Частые причины `400`:
|
||||||
|
- Неверный формат `collected_at` (требуется RFC3339).
|
||||||
|
- Пустой `hardware.board.serial_number`.
|
||||||
|
- Наличие неизвестного JSON-поля на любом уровне.
|
||||||
|
- Тело запроса превышает допустимый размер.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Структура верхнего уровня
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"filename": "redfish://10.10.10.103",
|
||||||
|
"source_type": "api",
|
||||||
|
"protocol": "redfish",
|
||||||
|
"target_host": "10.10.10.103",
|
||||||
|
"collected_at": "2026-02-10T15:30:00Z",
|
||||||
|
"hardware": {
|
||||||
|
"board": { ... },
|
||||||
|
"firmware": [ ... ],
|
||||||
|
"cpus": [ ... ],
|
||||||
|
"memory": [ ... ],
|
||||||
|
"storage": [ ... ],
|
||||||
|
"pcie_devices": [ ... ],
|
||||||
|
"power_supplies": [ ... ],
|
||||||
|
"sensors": { ... },
|
||||||
|
"event_logs": [ ... ]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Поля верхнего уровня
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `collected_at` | string RFC3339 | **да** | Время сбора данных |
|
||||||
|
| `hardware` | object | **да** | Аппаратный снапшот |
|
||||||
|
| `hardware.board.serial_number` | string | **да** | Серийный номер платы/сервера |
|
||||||
|
| `target_host` | string | нет | IP или hostname |
|
||||||
|
| `source_type` | string | нет | Тип источника: `api`, `logfile`, `manual` |
|
||||||
|
| `protocol` | string | нет | Протокол: `redfish`, `ipmi`, `snmp`, `ssh` |
|
||||||
|
| `filename` | string | нет | Идентификатор источника |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Общие поля статуса компонентов
|
||||||
|
|
||||||
|
Применяются ко всем компонентным секциям (`cpus`, `memory`, `storage`, `pcie_devices`, `power_supplies`).
|
||||||
|
|
||||||
|
| Поле | Тип | Описание |
|
||||||
|
|------|-----|----------|
|
||||||
|
| `status` | string | Текущий статус: `OK`, `Warning`, `Critical`, `Unknown`, `Empty` |
|
||||||
|
| `status_checked_at` | string RFC3339 | Время последней проверки статуса |
|
||||||
|
| `status_changed_at` | string RFC3339 | Время последнего изменения статуса |
|
||||||
|
| `status_history` | array | История переходов статусов (см. ниже) |
|
||||||
|
| `error_description` | string | Текст ошибки/диагностики |
|
||||||
|
| `manufactured_year_week` | string | Дата производства в формате `YYYY-Www`, например `2024-W07` |
|
||||||
|
|
||||||
|
**Объект `status_history[]`:**
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `status` | string | **да** | Статус в этот момент |
|
||||||
|
| `changed_at` | string RFC3339 | **да** | Время перехода (без этого поля запись игнорируется) |
|
||||||
|
| `details` | string | нет | Пояснение к переходу |
|
||||||
|
|
||||||
|
**Правила приоритета времени события:**
|
||||||
|
|
||||||
|
1. `status_changed_at`
|
||||||
|
2. Последняя запись `status_history` с совпадающим статусом
|
||||||
|
3. Последняя парсируемая запись `status_history`
|
||||||
|
4. `status_checked_at`
|
||||||
|
|
||||||
|
**Правила передачи статусов:**
|
||||||
|
- Передавайте `status` как текущее состояние компонента в snapshot.
|
||||||
|
- Если источник хранит историю — передавайте `status_history` отсортированным по `changed_at` по возрастанию.
|
||||||
|
- Не включайте записи `status_history` без `changed_at`.
|
||||||
|
- Все даты — RFC3339, рекомендуется UTC (`Z`).
|
||||||
|
- `manufactured_year_week` используйте, когда источник знает только год и неделю производства, без точной календарной даты.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Секции hardware
|
||||||
|
|
||||||
|
### board
|
||||||
|
|
||||||
|
Основная информация о сервере. Обязательная секция.
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `serial_number` | string | **да** | Серийный номер (ключ идентификации Asset) |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `product_name` | string | нет | Модель |
|
||||||
|
| `part_number` | string | нет | Партномер |
|
||||||
|
| `uuid` | string | нет | UUID системы |
|
||||||
|
|
||||||
|
Значения `"NULL"` в строковых полях трактуются как отсутствие данных.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"board": {
|
||||||
|
"manufacturer": "Supermicro",
|
||||||
|
"product_name": "X12DPG-QT6",
|
||||||
|
"serial_number": "21D634101",
|
||||||
|
"part_number": "X12DPG-QT6-REV1.01",
|
||||||
|
"uuid": "d7ef2fe5-2fd0-11f0-910a-346f11040868"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### firmware
|
||||||
|
|
||||||
|
Версии прошивок системных компонентов (BIOS, BMC, CPLD и др.).
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `device_name` | string | **да** | Название устройства (`BIOS`, `BMC`, `CPLD`, …) |
|
||||||
|
| `version` | string | **да** | Версия прошивки |
|
||||||
|
|
||||||
|
Записи с пустым `device_name` или `version` игнорируются.
|
||||||
|
Изменение версии создаёт событие `FIRMWARE_CHANGED` для Asset.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"firmware": [
|
||||||
|
{ "device_name": "BIOS", "version": "06.08.05" },
|
||||||
|
{ "device_name": "BMC", "version": "5.17.00" },
|
||||||
|
{ "device_name": "CPLD", "version": "01.02.03" }
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### cpus
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `socket` | int | **да** | Номер сокета (используется для генерации serial) |
|
||||||
|
| `model` | string | нет | Модель процессора |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `cores` | int | нет | Количество ядер |
|
||||||
|
| `threads` | int | нет | Количество потоков |
|
||||||
|
| `frequency_mhz` | int | нет | Текущая частота |
|
||||||
|
| `max_frequency_mhz` | int | нет | Максимальная частота |
|
||||||
|
| `temperature_c` | float | нет | Температура CPU, °C (telemetry) |
|
||||||
|
| `power_w` | float | нет | Текущая мощность CPU, Вт (telemetry) |
|
||||||
|
| `throttled` | bool | нет | Зафиксирован thermal/power throttling |
|
||||||
|
| `correctable_error_count` | int | нет | Количество корректируемых ошибок CPU |
|
||||||
|
| `uncorrectable_error_count` | int | нет | Количество некорректируемых ошибок CPU |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| `serial_number` | string | нет | Серийный номер (если доступен) |
|
||||||
|
| `firmware` | string | нет | Версия микрокода; если логгер отдаёт `Microcode level`, передавайте его сюда как есть |
|
||||||
|
| `present` | bool | нет | Наличие (по умолчанию `true`) |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
**Генерация serial_number при отсутствии:** `{board_serial}-CPU-{socket}`
|
||||||
|
|
||||||
|
Если источник использует поле/лейбл `Microcode level`, его значение передавайте в `cpus[].firmware` без дополнительного преобразования.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"cpus": [
|
||||||
|
{
|
||||||
|
"socket": 0,
|
||||||
|
"model": "INTEL(R) XEON(R) GOLD 6530",
|
||||||
|
"cores": 32,
|
||||||
|
"threads": 64,
|
||||||
|
"frequency_mhz": 2100,
|
||||||
|
"max_frequency_mhz": 4000,
|
||||||
|
"temperature_c": 61.5,
|
||||||
|
"power_w": 182.0,
|
||||||
|
"throttled": false,
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"status": "OK",
|
||||||
|
"status_checked_at": "2026-02-10T15:28:00Z"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### memory
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `slot` | string | нет | Идентификатор слота |
|
||||||
|
| `present` | bool | нет | Наличие модуля (по умолчанию `true`) |
|
||||||
|
| `serial_number` | string | нет | Серийный номер |
|
||||||
|
| `part_number` | string | нет | Партномер (используется как модель) |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `size_mb` | int | нет | Объём в МБ |
|
||||||
|
| `type` | string | нет | Тип: `DDR3`, `DDR4`, `DDR5`, … |
|
||||||
|
| `max_speed_mhz` | int | нет | Максимальная частота |
|
||||||
|
| `current_speed_mhz` | int | нет | Текущая частота |
|
||||||
|
| `temperature_c` | float | нет | Температура DIMM/модуля, °C (telemetry) |
|
||||||
|
| `correctable_ecc_error_count` | int | нет | Количество корректируемых ECC-ошибок |
|
||||||
|
| `uncorrectable_ecc_error_count` | int | нет | Количество некорректируемых ECC-ошибок |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| `spare_blocks_remaining_pct` | float | нет | Остаток spare blocks, % |
|
||||||
|
| `performance_degraded` | bool | нет | Зафиксирована деградация производительности |
|
||||||
|
| `data_loss_detected` | bool | нет | Источник сигнализирует риск/факт потери данных |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
Модуль без `serial_number` игнорируется. Модуль с `present=false` или `status=Empty` игнорируется.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"memory": [
|
||||||
|
{
|
||||||
|
"slot": "CPU0_C0D0",
|
||||||
|
"present": true,
|
||||||
|
"size_mb": 32768,
|
||||||
|
"type": "DDR5",
|
||||||
|
"max_speed_mhz": 4800,
|
||||||
|
"current_speed_mhz": 4800,
|
||||||
|
"temperature_c": 43.0,
|
||||||
|
"correctable_ecc_error_count": 0,
|
||||||
|
"manufacturer": "Hynix",
|
||||||
|
"serial_number": "80AD032419E17CEEC1",
|
||||||
|
"part_number": "HMCG88AGBRA191N",
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### storage
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `slot` | string | нет | Идентификатор слота/посадочного места накопителя (например `OB01`); для NVMe-устройств допустимо передавать BDF (`0000:18:00.0`) |
|
||||||
|
| `serial_number` | string | нет | Серийный номер |
|
||||||
|
| `model` | string | нет | Модель |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` |
|
||||||
|
| `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` |
|
||||||
|
| `size_gb` | int | нет | Размер в ГБ |
|
||||||
|
| `temperature_c` | float | нет | Температура накопителя, °C (telemetry) |
|
||||||
|
| `power_on_hours` | int64 | нет | Время работы, часы |
|
||||||
|
| `power_cycles` | int64 | нет | Количество циклов питания |
|
||||||
|
| `unsafe_shutdowns` | int64 | нет | Нештатные выключения |
|
||||||
|
| `media_errors` | int64 | нет | Ошибки носителя / media errors |
|
||||||
|
| `error_log_entries` | int64 | нет | Количество записей в error log |
|
||||||
|
| `written_bytes` | int64 | нет | Всего записано байт |
|
||||||
|
| `read_bytes` | int64 | нет | Всего прочитано байт |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `available_spare_pct` | float | нет | Доступный spare, % |
|
||||||
|
| `reallocated_sectors` | int64 | нет | Переназначенные сектора |
|
||||||
|
| `current_pending_sectors` | int64 | нет | Сектора в ожидании ремапа |
|
||||||
|
| `offline_uncorrectable` | int64 | нет | Некорректируемые ошибки offline scan |
|
||||||
|
| `firmware` | string | нет | Версия прошивки |
|
||||||
|
| `present` | bool | нет | Наличие (по умолчанию `true`) |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"storage": [
|
||||||
|
{
|
||||||
|
"slot": "OB01",
|
||||||
|
"type": "NVMe",
|
||||||
|
"model": "INTEL SSDPF2KX076T1",
|
||||||
|
"size_gb": 7680,
|
||||||
|
"temperature_c": 38.5,
|
||||||
|
"power_on_hours": 12450,
|
||||||
|
"unsafe_shutdowns": 3,
|
||||||
|
"written_bytes": 9876543210,
|
||||||
|
"life_remaining_pct": 91.0,
|
||||||
|
"serial_number": "BTAX41900GF87P6DGN",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"firmware": "9CV10510",
|
||||||
|
"interface": "NVMe",
|
||||||
|
"present": true,
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### pcie_devices
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `slot` | string | нет | Идентификатор слота |
|
||||||
|
| `vendor_id` | int | нет | PCI Vendor ID (decimal) |
|
||||||
|
| `device_id` | int | нет | PCI Device ID (decimal) |
|
||||||
|
| `numa_node` | int | нет | NUMA node / CPU affinity устройства |
|
||||||
|
| `temperature_c` | float | нет | Температура устройства, °C (telemetry) |
|
||||||
|
| `power_w` | float | нет | Текущее энергопотребление устройства, Вт (telemetry) |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| `ecc_corrected_total` | int64 | нет | Всего корректируемых ECC-ошибок |
|
||||||
|
| `ecc_uncorrected_total` | int64 | нет | Всего некорректируемых ECC-ошибок |
|
||||||
|
| `hw_slowdown` | bool | нет | Устройство вошло в hardware slowdown / protective mode |
|
||||||
|
| `battery_charge_pct` | float | нет | Заряд батареи / supercap, % |
|
||||||
|
| `battery_health_pct` | float | нет | Состояние батареи / supercap, % |
|
||||||
|
| `battery_temperature_c` | float | нет | Температура батареи / supercap, °C |
|
||||||
|
| `battery_voltage_v` | float | нет | Напряжение батареи / supercap, В |
|
||||||
|
| `battery_replace_required` | bool | нет | Требуется замена батареи / supercap |
|
||||||
|
| `sfp_temperature_c` | float | нет | Температура SFP/optic, °C |
|
||||||
|
| `sfp_tx_power_dbm` | float | нет | TX optical power, dBm |
|
||||||
|
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm |
|
||||||
|
| `sfp_voltage_v` | float | нет | Напряжение SFP, В |
|
||||||
|
| `sfp_bias_ma` | float | нет | Bias current SFP, мА |
|
||||||
|
| `bdf` | string | нет | Deprecated alias для `slot`; при наличии ingest нормализует его в `slot` |
|
||||||
|
| `device_class` | string | нет | Класс устройства (см. список ниже) |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `model` | string | нет | Модель |
|
||||||
|
| `serial_number` | string | нет | Серийный номер |
|
||||||
|
| `firmware` | string | нет | Версия прошивки |
|
||||||
|
| `link_width` | int | нет | Текущая ширина линка |
|
||||||
|
| `link_speed` | string | нет | Текущая скорость: `Gen3`, `Gen4`, `Gen5` |
|
||||||
|
| `max_link_width` | int | нет | Максимальная ширина линка |
|
||||||
|
| `max_link_speed` | string | нет | Максимальная скорость |
|
||||||
|
| `mac_addresses` | string[] | нет | MAC-адреса портов (для сетевых устройств) |
|
||||||
|
| `present` | bool | нет | Наличие (по умолчанию `true`) |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
`numa_node` передавайте для NIC / InfiniBand / RAID / GPU, когда источник знает CPU/NUMA affinity. Поле сохраняется в snapshot-атрибутах PCIe-компонента и дублируется в telemetry для topology use cases.
|
||||||
|
Поля `temperature_c` и `power_w` используйте для device-level telemetry GPU / accelerator / smart PCIe devices. Они не влияют на идентификацию компонента.
|
||||||
|
|
||||||
|
**Генерация serial_number при отсутствии или `"N/A"`:** `{board_serial}-PCIE-{slot}`, где `slot` для PCIe равен BDF.
|
||||||
|
|
||||||
|
`slot` — единственный канонический адрес компонента. Для PCIe в `slot` передавайте BDF. Поле `bdf` сохраняется только как переходный alias на входе и не должно использоваться как отдельная координата рядом со `slot`.
|
||||||
|
|
||||||
|
**Значения `device_class`:**
|
||||||
|
|
||||||
|
| Значение | Назначение |
|
||||||
|
|----------|------------|
|
||||||
|
| `MassStorageController` | RAID-контроллеры |
|
||||||
|
| `StorageController` | HBA, SAS-контроллеры |
|
||||||
|
| `NetworkController` | Сетевые адаптеры (InfiniBand, общий) |
|
||||||
|
| `EthernetController` | Ethernet NIC |
|
||||||
|
| `FibreChannelController` | Fibre Channel HBA |
|
||||||
|
| `VideoController` | GPU, видеокарты |
|
||||||
|
| `ProcessingAccelerator` | Вычислительные ускорители (AI/ML) |
|
||||||
|
| `DisplayController` | Контроллеры дисплея (BMC VGA) |
|
||||||
|
|
||||||
|
Список открытый: допускаются произвольные строки для нестандартных классов.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"pcie_devices": [
|
||||||
|
{
|
||||||
|
"slot": "0000:3b:00.0",
|
||||||
|
"vendor_id": 5555,
|
||||||
|
"device_id": 4401,
|
||||||
|
"numa_node": 0,
|
||||||
|
"temperature_c": 48.5,
|
||||||
|
"power_w": 18.2,
|
||||||
|
"sfp_temperature_c": 36.2,
|
||||||
|
"sfp_tx_power_dbm": -1.8,
|
||||||
|
"sfp_rx_power_dbm": -2.1,
|
||||||
|
"device_class": "EthernetController",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"model": "X710 10GbE",
|
||||||
|
"serial_number": "K65472-003",
|
||||||
|
"firmware": "9.20 0x8000d4ae",
|
||||||
|
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### power_supplies
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `slot` | string | нет | Идентификатор слота |
|
||||||
|
| `present` | bool | нет | Наличие (по умолчанию `true`) |
|
||||||
|
| `serial_number` | string | нет | Серийный номер |
|
||||||
|
| `part_number` | string | нет | Партномер |
|
||||||
|
| `model` | string | нет | Модель |
|
||||||
|
| `vendor` | string | нет | Производитель |
|
||||||
|
| `wattage_w` | int | нет | Мощность в ваттах |
|
||||||
|
| `firmware` | string | нет | Версия прошивки |
|
||||||
|
| `input_type` | string | нет | Тип входа (например `ACWideRange`) |
|
||||||
|
| `input_voltage` | float | нет | Входное напряжение, В (telemetry) |
|
||||||
|
| `input_power_w` | float | нет | Входная мощность, Вт (telemetry) |
|
||||||
|
| `output_power_w` | float | нет | Выходная мощность, Вт (telemetry) |
|
||||||
|
| `temperature_c` | float | нет | Температура PSU, °C (telemetry) |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
Поля telemetry (`input_voltage`, `input_power_w`, `output_power_w`, `temperature_c`, `life_remaining_pct`, `life_used_pct`) сохраняются в атрибутах компонента и не влияют на его идентификацию.
|
||||||
|
|
||||||
|
PSU без `serial_number` игнорируется.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"power_supplies": [
|
||||||
|
{
|
||||||
|
"slot": "0",
|
||||||
|
"present": true,
|
||||||
|
"model": "GW-CRPS3000LW",
|
||||||
|
"vendor": "Great Wall",
|
||||||
|
"wattage_w": 3000,
|
||||||
|
"serial_number": "2P06C102610",
|
||||||
|
"firmware": "00.03.05",
|
||||||
|
"status": "OK",
|
||||||
|
"input_type": "ACWideRange",
|
||||||
|
"input_power_w": 137,
|
||||||
|
"output_power_w": 104,
|
||||||
|
"input_voltage": 215.25,
|
||||||
|
"temperature_c": 39.5,
|
||||||
|
"life_remaining_pct": 97.0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### sensors
|
||||||
|
|
||||||
|
Показания сенсоров сервера. Секция опциональная, не привязана к компонентам.
|
||||||
|
Данные хранятся как последнее известное значение (last-known-value) на уровне Asset.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"sensors": {
|
||||||
|
"fans": [ ... ],
|
||||||
|
"power": [ ... ],
|
||||||
|
"temperatures": [ ... ],
|
||||||
|
"other": [ ... ]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### event_logs
|
||||||
|
|
||||||
|
Нормализованные операционные логи сервера из `host`, `bmc` или `redfish`.
|
||||||
|
|
||||||
|
Эти записи не попадают в history timeline и не создают history events. Они сохраняются в отдельной deduplicated log store и отображаются в отдельном UI-блоке asset logs / host logs.
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `source` | string | **да** | Источник лога: `host`, `bmc`, `redfish` |
|
||||||
|
| `event_time` | string RFC3339 | нет | Время события из источника; если отсутствует, используется время ingest/collection |
|
||||||
|
| `severity` | string | нет | Уровень: `OK`, `Info`, `Warning`, `Critical`, `Unknown` |
|
||||||
|
| `message_id` | string | нет | Идентификатор/код события источника |
|
||||||
|
| `message` | string | **да** | Нормализованный текст события |
|
||||||
|
| `component_ref` | string | нет | Ссылка на компонент/устройство/слот, если извлекается |
|
||||||
|
| `fingerprint` | string | нет | Внешний готовый dedup-key; если не передан, система вычисляет свой |
|
||||||
|
| `is_active` | bool | нет | Признак, что событие всё ещё активно/не погашено, если источник умеет lifecycle |
|
||||||
|
| `raw_payload` | object | нет | Сырой vendor-specific payload для диагностики |
|
||||||
|
|
||||||
|
**Правила event_logs:**
|
||||||
|
- Логи дедуплицируются в рамках asset + source + fingerprint.
|
||||||
|
- Если `fingerprint` не передан, система строит его из нормализованных полей (`source`, `message_id`, `message`, `component_ref`, временная нормализация).
|
||||||
|
- Интегратор/сборщик логов не должен синтезировать содержимое событий: не придумывайте `message`, `message_id`, `component_ref`, serial/device identifiers или иные поля, если они отсутствуют в исходном логе или не были надёжно извлечены.
|
||||||
|
- Повторное получение того же события обновляет `last_seen_at`/счётчик повторов и не должно создавать новый timeline/history event.
|
||||||
|
- `event_logs` используются для отдельного UI-представления логов и не изменяют canonical state компонентов/asset по умолчанию.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"event_logs": [
|
||||||
|
{
|
||||||
|
"source": "bmc",
|
||||||
|
"event_time": "2026-03-15T14:03:11Z",
|
||||||
|
"severity": "Warning",
|
||||||
|
"message_id": "0x000F",
|
||||||
|
"message": "Correctable ECC error threshold exceeded",
|
||||||
|
"component_ref": "CPU0_C0D0",
|
||||||
|
"raw_payload": {
|
||||||
|
"sensor": "DIMM_A1",
|
||||||
|
"sel_record_id": "0042"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": "redfish",
|
||||||
|
"event_time": "2026-03-15T14:03:20Z",
|
||||||
|
"severity": "Info",
|
||||||
|
"message_id": "OpenBMC.0.1.SystemReboot",
|
||||||
|
"message": "System reboot requested by administrator",
|
||||||
|
"component_ref": "Mainboard"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### sensors.fans
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `name` | string | **да** | Уникальное имя сенсора в рамках секции |
|
||||||
|
| `location` | string | нет | Физическое расположение |
|
||||||
|
| `rpm` | int | нет | Обороты, RPM |
|
||||||
|
| `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` |
|
||||||
|
|
||||||
|
#### sensors.power
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `name` | string | **да** | Уникальное имя сенсора |
|
||||||
|
| `location` | string | нет | Физическое расположение |
|
||||||
|
| `voltage_v` | float | нет | Напряжение, В |
|
||||||
|
| `current_a` | float | нет | Ток, А |
|
||||||
|
| `power_w` | float | нет | Мощность, Вт |
|
||||||
|
| `status` | string | нет | Статус |
|
||||||
|
|
||||||
|
#### sensors.temperatures
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `name` | string | **да** | Уникальное имя сенсора |
|
||||||
|
| `location` | string | нет | Физическое расположение |
|
||||||
|
| `celsius` | float | нет | Температура, °C |
|
||||||
|
| `threshold_warning_celsius` | float | нет | Порог Warning, °C |
|
||||||
|
| `threshold_critical_celsius` | float | нет | Порог Critical, °C |
|
||||||
|
| `status` | string | нет | Статус |
|
||||||
|
|
||||||
|
#### sensors.other
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `name` | string | **да** | Уникальное имя сенсора |
|
||||||
|
| `location` | string | нет | Физическое расположение |
|
||||||
|
| `value` | float | нет | Значение |
|
||||||
|
| `unit` | string | нет | Единица измерения |
|
||||||
|
| `status` | string | нет | Статус |
|
||||||
|
|
||||||
|
**Правила sensors:**
|
||||||
|
- Идентификатор сенсора: пара `(sensor_type, name)`. Дубли в одном payload — берётся первое вхождение.
|
||||||
|
- Сенсоры без `name` игнорируются.
|
||||||
|
- При каждом импорте значения перезаписываются (upsert по ключу).
|
||||||
|
|
||||||
|
```json
|
||||||
|
"sensors": {
|
||||||
|
"fans": [
|
||||||
|
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" },
|
||||||
|
{ "name": "FAN_CPU0", "location": "CPU0", "rpm": 5600, "status": "OK" }
|
||||||
|
],
|
||||||
|
"power": [
|
||||||
|
{ "name": "12V Rail", "location": "Mainboard", "voltage_v": 12.06, "status": "OK" },
|
||||||
|
{ "name": "PSU0 Input", "location": "PSU0", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
|
||||||
|
],
|
||||||
|
"temperatures": [
|
||||||
|
{ "name": "CPU0 Temp", "location": "CPU0", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
|
||||||
|
{ "name": "Inlet Temp", "location": "Front", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
|
||||||
|
],
|
||||||
|
"other": [
|
||||||
|
{ "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Обработка статусов компонентов
|
||||||
|
|
||||||
|
| Статус | Поведение |
|
||||||
|
|--------|-----------|
|
||||||
|
| `OK` | Нормальная обработка |
|
||||||
|
| `Warning` | Создаётся событие `COMPONENT_WARNING` |
|
||||||
|
| `Critical` | Создаётся событие `COMPONENT_FAILED` + запись в `failure_events` |
|
||||||
|
| `Unknown` | Компонент считается рабочим, создаётся событие `COMPONENT_UNKNOWN` |
|
||||||
|
| `Empty` | Компонент не создаётся/не обновляется |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Обработка отсутствующих serial_number
|
||||||
|
|
||||||
|
Общее правило для всех секций: если источник не вернул серийный номер и сборщик не смог его надёжно извлечь, интегратор не должен подставлять вымышленные значения, хеши, локальные placeholder-идентификаторы или серийные номера "по догадке". Разрешены только явно оговорённые ниже server-side fallback-правила ingest.
|
||||||
|
|
||||||
|
| Тип | Поведение |
|
||||||
|
|-----|-----------|
|
||||||
|
| CPU | Генерируется: `{board_serial}-CPU-{socket}` |
|
||||||
|
| PCIe | Генерируется: `{board_serial}-PCIE-{slot}` (если serial = `"N/A"` или пустой; `slot` для PCIe = BDF) |
|
||||||
|
| Memory | Компонент игнорируется |
|
||||||
|
| Storage | Компонент игнорируется |
|
||||||
|
| PSU | Компонент игнорируется |
|
||||||
|
|
||||||
|
Если `serial_number` не уникален внутри одного payload для того же `model`:
|
||||||
|
- Первое вхождение сохраняет оригинальный серийный номер.
|
||||||
|
- Каждое следующее дублирующее получает placeholder: `NO_SN-XXXXXXXX`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Минимальный валидный пример
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"collected_at": "2026-02-10T15:30:00Z",
|
||||||
|
"target_host": "192.168.1.100",
|
||||||
|
"hardware": {
|
||||||
|
"board": {
|
||||||
|
"serial_number": "SRV-001"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Полный пример с историей статусов
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"filename": "redfish://10.10.10.103",
|
||||||
|
"source_type": "api",
|
||||||
|
"protocol": "redfish",
|
||||||
|
"target_host": "10.10.10.103",
|
||||||
|
"collected_at": "2026-02-10T15:30:00Z",
|
||||||
|
"hardware": {
|
||||||
|
"board": {
|
||||||
|
"manufacturer": "Supermicro",
|
||||||
|
"product_name": "X12DPG-QT6",
|
||||||
|
"serial_number": "21D634101"
|
||||||
|
},
|
||||||
|
"firmware": [
|
||||||
|
{ "device_name": "BIOS", "version": "06.08.05" },
|
||||||
|
{ "device_name": "BMC", "version": "5.17.00" }
|
||||||
|
],
|
||||||
|
"cpus": [
|
||||||
|
{
|
||||||
|
"socket": 0,
|
||||||
|
"model": "INTEL(R) XEON(R) GOLD 6530",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"cores": 32,
|
||||||
|
"threads": 64,
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"storage": [
|
||||||
|
{
|
||||||
|
"slot": "OB01",
|
||||||
|
"type": "NVMe",
|
||||||
|
"model": "INTEL SSDPF2KX076T1",
|
||||||
|
"size_gb": 7680,
|
||||||
|
"serial_number": "BTAX41900GF87P6DGN",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"firmware": "9CV10510",
|
||||||
|
"present": true,
|
||||||
|
"status": "OK",
|
||||||
|
"status_changed_at": "2026-02-10T15:22:00Z",
|
||||||
|
"status_history": [
|
||||||
|
{ "status": "Critical", "changed_at": "2026-02-10T15:10:00Z", "details": "I/O timeout on NVMe queue 3" },
|
||||||
|
{ "status": "OK", "changed_at": "2026-02-10T15:22:00Z", "details": "Recovered after controller reset" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"pcie_devices": [
|
||||||
|
{
|
||||||
|
"slot": "0000:18:00.0",
|
||||||
|
"device_class": "EthernetController",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"model": "X710 10GbE",
|
||||||
|
"serial_number": "K65472-003",
|
||||||
|
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"power_supplies": [
|
||||||
|
{
|
||||||
|
"slot": "0",
|
||||||
|
"present": true,
|
||||||
|
"model": "GW-CRPS3000LW",
|
||||||
|
"vendor": "Great Wall",
|
||||||
|
"wattage_w": 3000,
|
||||||
|
"serial_number": "2P06C102610",
|
||||||
|
"firmware": "00.03.05",
|
||||||
|
"status": "OK",
|
||||||
|
"input_power_w": 137,
|
||||||
|
"output_power_w": 104,
|
||||||
|
"input_voltage": 215.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"sensors": {
|
||||||
|
"fans": [
|
||||||
|
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" }
|
||||||
|
],
|
||||||
|
"power": [
|
||||||
|
{ "name": "12V Rail", "voltage_v": 12.06, "status": "OK" }
|
||||||
|
],
|
||||||
|
"temperatures": [
|
||||||
|
{ "name": "CPU0 Temp", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" }
|
||||||
|
],
|
||||||
|
"other": [
|
||||||
|
{ "name": "System Humidity", "value": 38.5, "unit": "%" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
1
internal/chart
Submodule
1
internal/chart
Submodule
Submodule internal/chart added at 05db6994d4
58
iso/README.md
Normal file
58
iso/README.md
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
# ISO Build
|
||||||
|
|
||||||
|
`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Docker Desktop or another Docker-compatible container runtime
|
||||||
|
- Privileged containers enabled
|
||||||
|
- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
|
||||||
|
|
||||||
|
## Build On macOS
|
||||||
|
|
||||||
|
From the repository root:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sh iso/builder/build-in-container.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
The script defaults to `linux/amd64` builder containers, so it works on:
|
||||||
|
|
||||||
|
- Intel Mac
|
||||||
|
- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
|
||||||
|
|
||||||
|
You do not need to pass `--platform` manually for normal ISO builds.
|
||||||
|
|
||||||
|
## Useful Options
|
||||||
|
|
||||||
|
Build with explicit SSH keys baked into the ISO:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
||||||
|
```
|
||||||
|
|
||||||
|
Rebuild the builder image:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sh iso/builder/build-in-container.sh --rebuild-image
|
||||||
|
```
|
||||||
|
|
||||||
|
Use a custom cache directory:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||||
|
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||||
|
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
||||||
|
- Override the container platform only if you know why:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
- The shipped ISO is still `amd64`.
|
||||||
|
- Output ISO artifacts are written under `dist/`.
|
||||||
57
iso/builder/Dockerfile
Normal file
57
iso/builder/Dockerfile
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
FROM debian:12
|
||||||
|
|
||||||
|
ARG GO_VERSION=1.24.0
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
RUN apt-get update -qq && apt-get install -y \
|
||||||
|
ca-certificates \
|
||||||
|
live-build \
|
||||||
|
debootstrap \
|
||||||
|
squashfs-tools \
|
||||||
|
xorriso \
|
||||||
|
grub-pc-bin \
|
||||||
|
grub-efi-amd64-bin \
|
||||||
|
mtools \
|
||||||
|
git \
|
||||||
|
wget \
|
||||||
|
curl \
|
||||||
|
tar \
|
||||||
|
xz-utils \
|
||||||
|
rsync \
|
||||||
|
build-essential \
|
||||||
|
gcc \
|
||||||
|
make \
|
||||||
|
perl \
|
||||||
|
linux-headers-amd64 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Add NVIDIA CUDA repo and install nvcc (needed to compile nccl-tests)
|
||||||
|
RUN wget -qO /tmp/cuda-keyring.gpg \
|
||||||
|
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub \
|
||||||
|
&& gpg --dearmor < /tmp/cuda-keyring.gpg \
|
||||||
|
> /usr/share/keyrings/nvidia-cuda.gpg \
|
||||||
|
&& rm /tmp/cuda-keyring.gpg \
|
||||||
|
&& echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] \
|
||||||
|
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
|
||||||
|
> /etc/apt/sources.list.d/cuda.list \
|
||||||
|
&& apt-get update -qq \
|
||||||
|
&& apt-get install -y cuda-nvcc-12-8 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
|
&& ln -sfn /usr/local/cuda-12.8 /usr/local/cuda
|
||||||
|
|
||||||
|
RUN arch="$(dpkg --print-architecture)" \
|
||||||
|
&& case "$arch" in \
|
||||||
|
amd64) goarch=amd64 ;; \
|
||||||
|
arm64) goarch=arm64 ;; \
|
||||||
|
*) echo "unsupported architecture: $arch" >&2; exit 1 ;; \
|
||||||
|
esac \
|
||||||
|
&& wget -q -O /tmp/go.tar.gz "https://go.dev/dl/go${GO_VERSION}.linux-${goarch}.tar.gz" \
|
||||||
|
&& rm -rf /usr/local/go \
|
||||||
|
&& tar -C /usr/local -xzf /tmp/go.tar.gz \
|
||||||
|
&& rm -f /tmp/go.tar.gz
|
||||||
|
|
||||||
|
ENV PATH=/usr/local/go/bin:${PATH}
|
||||||
|
WORKDIR /work
|
||||||
|
|
||||||
|
CMD ["/bin/bash"]
|
||||||
@@ -1,4 +1,12 @@
|
|||||||
ALPINE_VERSION=3.21
|
DEBIAN_VERSION=12
|
||||||
|
DEBIAN_KERNEL_ABI=auto
|
||||||
NVIDIA_DRIVER_VERSION=590.48.01
|
NVIDIA_DRIVER_VERSION=590.48.01
|
||||||
GO_VERSION=1.23.6
|
NCCL_VERSION=2.28.9-1
|
||||||
AUDIT_VERSION=0.1.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
|
NCCL_TESTS_VERSION=2.13.10
|
||||||
|
NVCC_VERSION=12.8
|
||||||
|
CUBLAS_VERSION=13.0.2.14-1
|
||||||
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
|
GO_VERSION=1.24.0
|
||||||
|
AUDIT_VERSION=1.0.0
|
||||||
|
|||||||
5
iso/builder/auto/build
Executable file
5
iso/builder/auto/build
Executable file
@@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/sh
# auto/build — thin wrapper around `lb build` for the bee ISO.
# live-build invokes this automatically; extra args pass straight through.
set -e

# Merge stderr into stdout so the entire build log lands in one stream.
lb build noauto "$@" 2>&1
|
||||||
37
iso/builder/auto/config
Executable file
37
iso/builder/auto/config
Executable file
@@ -0,0 +1,37 @@
|
|||||||
|
#!/bin/sh
# auto/config — live-build configuration for the bee ISO.
# Runs automatically whenever `lb config` is called. See: man lb_config

set -e

# Pull pinned component versions used by the build.
. "$(dirname "$0")/../VERSIONS"

# Pin the exact kernel ABI detected by build.sh so the ISO kernel matches
# the kernel headers used to compile NVIDIA modules. When lb config is run
# by hand (variable unset or "auto"), fall back to the meta-package.
case "${BEE_KERNEL_ABI:-}" in
  ''|auto) kernel_packages="linux-image" ;;
  *)       kernel_packages="linux-image-${BEE_KERNEL_ABI}" ;;
esac

lb config noauto \
  --distribution bookworm \
  --architectures amd64 \
  --binary-images iso-hybrid \
  --bootloaders "grub-efi,syslinux" \
  --debian-installer none \
  --archive-areas "main contrib non-free non-free-firmware" \
  --mirror-bootstrap "https://deb.debian.org/debian" \
  --mirror-chroot "https://deb.debian.org/debian" \
  --mirror-binary "https://deb.debian.org/debian" \
  --security true \
  --linux-flavours "amd64" \
  --linux-packages "${kernel_packages}" \
  --memtest none \
  --iso-volume "EASY-BEE" \
  --iso-application "EASY-BEE" \
  --bootappend-live "boot=live components quiet nomodeset video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
  --apt-recommends false \
  "$@"
|
||||||
1176
iso/builder/bee-gpu-stress.c
Normal file
1176
iso/builder/bee-gpu-stress.c
Normal file
File diff suppressed because it is too large
Load Diff
190
iso/builder/build-cublas.sh
Normal file
190
iso/builder/build-cublas.sh
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
|
||||||
|
#
|
||||||
|
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||||
|
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||||
|
# - headers for compiling bee-gpu-stress against cuBLASLt
|
||||||
|
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
CUBLAS_VERSION="$1"
|
||||||
|
CUDA_USERSPACE_VERSION="$2"
|
||||||
|
CUDA_SERIES="$3"
|
||||||
|
DIST_DIR="$4"
|
||||||
|
|
||||||
|
[ -n "$CUBLAS_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||||
|
[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||||
|
[ -n "$CUDA_SERIES" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||||
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||||
|
|
||||||
|
CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
|
||||||
|
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
|
||||||
|
CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
|
||||||
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
|
||||||
|
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
|
||||||
|
|
||||||
|
echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="
|
||||||
|
|
||||||
|
if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
|
||||||
|
&& [ -f "${CACHE_DIR}/include/crt/host_defines.h" ] \
|
||||||
|
&& [ -f "${CACHE_DIR}/include/nv/target" ] \
|
||||||
|
&& [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||||
|
echo "=== cuBLAS cached, skipping download ==="
|
||||||
|
echo "cache: $CACHE_DIR"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"
|
||||||
|
|
||||||
|
echo "=== downloading Packages.gz ==="
|
||||||
|
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
|
||||||
|
|
||||||
|
lookup_pkg() {
  # Look up one package in the apt index at ${PACKAGES_GZ}.
  # $1 = package name; $2 = exact version ("" matches the first entry found).
  # Prints "Filename SHA256" for the match on stdout, or nothing when absent.
  want_pkg="$1"
  want_ver="$2"
  gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$want_pkg" -v ver="$want_ver" '
    /^Package: / { cur_pkg=$2; gsub(/\r/, "", cur_pkg) }
    /^Version: / { cur_ver=$2; gsub(/\r/, "", cur_ver) }
    /^Filename: / { cur_file=$2; gsub(/\r/, "", cur_file) }
    /^SHA256: / { cur_sha=$2; gsub(/\r/, "", cur_sha) }
    /^$/ {
      if (cur_pkg == pkg && (ver == "" || cur_ver == ver)) {
        print cur_file " " cur_sha
        printed=1
        exit
      }
      cur_pkg=""; cur_ver=""; cur_file=""; cur_sha=""
    }
    END {
      if (!printed && cur_pkg == pkg && (ver == "" || cur_ver == ver)) {
        print cur_file " " cur_sha
      }
    }'
}
|
||||||
|
|
||||||
|
download_verified_pkg() {
  # Download one .deb from ${REPO_BASE}, verified against the SHA256
  # recorded in Packages.gz. A cached copy is reused when its checksum
  # still matches. Prints the local file path on stdout; all progress
  # and log output goes to stderr so callers can capture the path.
  # $1 = package name; $2 = exact version ("" = first entry found).
  want_pkg="$1"
  want_ver="$2"

  entry="$(lookup_pkg "$want_pkg" "$want_ver")"
  [ -n "$entry" ] || { echo "ERROR: package metadata not found for ${want_pkg} ${want_ver}"; exit 1; }

  rel_path="$(printf '%s\n' "$entry" | awk '{print $1}')"
  want_sha="$(printf '%s\n' "$entry" | awk '{print $2}')"
  [ -n "$rel_path" ] || { echo "ERROR: package filename missing for ${want_pkg}"; exit 1; }
  [ -n "$want_sha" ] || { echo "ERROR: package sha missing for ${want_pkg}"; exit 1; }

  local_deb="${DOWNLOAD_CACHE_DIR}/$(basename "$rel_path")"
  if [ -f "$local_deb" ]; then
    got_sha="$(sha256sum "$local_deb" | awk '{print $1}')"
    if [ "$got_sha" = "$want_sha" ]; then
      echo "=== using cached $(basename "$rel_path") ===" >&2
      printf '%s\n' "$local_deb"
      return 0
    fi
    # Stale or corrupt cache entry: discard and re-download.
    echo "=== removing stale $(basename "$rel_path") (sha256 mismatch) ===" >&2
    rm -f "$local_deb"
  fi

  echo "=== downloading $(basename "$rel_path") ===" >&2
  wget --show-progress -O "$local_deb" "${REPO_BASE}/$(basename "$rel_path")"

  got_sha="$(sha256sum "$local_deb" | awk '{print $1}')"
  if [ "$got_sha" != "$want_sha" ]; then
    echo "ERROR: sha256 mismatch for $(basename "$rel_path")" >&2
    echo " expected: $want_sha" >&2
    echo " actual: $got_sha" >&2
    rm -f "$local_deb"
    exit 1
  fi
  echo "sha256 OK: $(basename "$rel_path")" >&2
  printf '%s\n' "$local_deb"
}
|
||||||
|
|
||||||
|
extract_deb() {
  # Unpack the data tarball of a .deb archive into a destination directory.
  # $1 = path to the .deb file; $2 = destination directory (created if missing).
  deb="$1"
  dst="$2"
  mkdir -p "$dst"
  # Resolve the archive to an absolute path first: the subshell below
  # cd's into $dst, which would break `ar x` on a relative .deb path
  # (callers derive the path from a possibly-relative DIST_DIR).
  case "$deb" in
    /*) ;;
    *) deb="$(pwd)/$deb" ;;
  esac
  (
    cd "$dst"
    ar x "$deb"
    # Locate data.tar.{xz,gz,zst,...} with a glob instead of parsing ls.
    data_tar=""
    for candidate in data.tar.*; do
      if [ -e "$candidate" ]; then
        data_tar="$candidate"
        break
      fi
    done
    [ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; }
    tar xf "$data_tar"
  )
}
|
||||||
|
|
||||||
|
copy_headers() {
  # Merge all header trees from an extracted .deb tree into ${CACHE_DIR}/include.
  # $1 = root of the extracted package tree.
  src_root="$1"
  # Standard Debian header location.
  if [ -d "${src_root}/usr/include" ]; then
    cp -a "${src_root}/usr/include/." "${CACHE_DIR}/include/"
  fi
  # NVIDIA CUDA packages also install headers under
  # /usr/local/cuda-X.Y/targets/x86_64-linux/include/.
  find "$src_root" -type d -name include | while read -r hdr_dir; do
    case "$hdr_dir" in
      */usr/include)
        # Already handled by the copy above.
        ;;
      *)
        # Only copy include directories that actually contain files.
        if find "${hdr_dir}" -maxdepth 3 \( -name '*.h' -o -type f \) | grep -q .; then
          cp -a "${hdr_dir}/." "${CACHE_DIR}/include/"
        fi
        ;;
    esac
  done
}
|
||||||
|
|
||||||
|
copy_libs() {
  # Copy cuBLAS / cuBLASLt / cudart shared objects (regular files and
  # symlinks alike) from an extracted package tree into ${CACHE_DIR}/lib.
  # $1 = root of the extracted package tree.
  src_root="$1"
  find "$src_root" \
    \( -type f -o -type l \) \
    \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
    -exec cp -a {} "${CACHE_DIR}/lib/" \;
}
|
||||||
|
|
||||||
|
make_links() {
  # Recreate the conventional symlink chain for one library:
  #   <base>.so -> <base>.so.N (soname) -> <base>.so.N.x.y (real file)
  # Silently does nothing when no versioned library file is present.
  # $1 = library base name, e.g. "libcublas".
  lib_base="$1"
  real_lib=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${lib_base}.so.[0-9]*" -type f | sort | head -1)
  [ -n "$real_lib" ] || return 0
  # Keep only the major version to derive the soname.
  soname=$(printf '%s\n' "$real_lib" | sed -E "s#.*/(${lib_base}\.so\.[0-9]+).*#\\1#")
  link_target=$(basename "$real_lib")
  ln -sf "$link_target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
  ln -sf "${soname}" "${CACHE_DIR}/lib/${lib_base}.so" 2>/dev/null || true
}
|
||||||
|
|
||||||
|
TMP_DIR=$(mktemp -d)
|
||||||
|
trap 'rm -rf "$TMP_DIR"' EXIT INT TERM
|
||||||
|
|
||||||
|
CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||||
|
CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||||
|
CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||||
|
CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||||
|
CUDA_CRT_DEB=$(download_verified_pkg "cuda-crt-${CUDA_SERIES_DASH}" "")
|
||||||
|
CUDA_CCCL_DEB=$(download_verified_pkg "cuda-cccl-${CUDA_SERIES_DASH}" "")
|
||||||
|
|
||||||
|
extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
|
||||||
|
extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
|
||||||
|
extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
|
||||||
|
extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"
|
||||||
|
extract_deb "$CUDA_CRT_DEB" "${TMP_DIR}/cuda-crt"
|
||||||
|
extract_deb "$CUDA_CCCL_DEB" "${TMP_DIR}/cuda-cccl"
|
||||||
|
|
||||||
|
copy_headers "${TMP_DIR}/cublas-dev"
|
||||||
|
copy_headers "${TMP_DIR}/cudart-dev"
|
||||||
|
copy_headers "${TMP_DIR}/cuda-crt"
|
||||||
|
copy_headers "${TMP_DIR}/cuda-cccl"
|
||||||
|
copy_libs "${TMP_DIR}/cublas-rt"
|
||||||
|
copy_libs "${TMP_DIR}/cudart-rt"
|
||||||
|
|
||||||
|
make_links "libcublas"
|
||||||
|
make_links "libcublasLt"
|
||||||
|
make_links "libcudart"
|
||||||
|
|
||||||
|
[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted"; exit 1; }
|
||||||
|
[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; }
|
||||||
|
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; }
|
||||||
|
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; }
|
||||||
|
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; }
|
||||||
|
|
||||||
|
echo "=== cuBLAS extraction complete ==="
|
||||||
|
echo "cache: $CACHE_DIR"
|
||||||
|
echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
|
||||||
|
echo "libs: $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"
|
||||||
123
iso/builder/build-in-container.sh
Executable file
123
iso/builder/build-in-container.sh
Executable file
@@ -0,0 +1,123 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# build-in-container.sh — build the bee ISO inside the Debian builder container.
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||||
|
BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||||
|
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
||||||
|
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||||
|
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||||
|
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||||
|
AUTH_KEYS=""
|
||||||
|
REBUILD_IMAGE=0
|
||||||
|
|
||||||
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
|
|
||||||
|
while [ $# -gt 0 ]; do
|
||||||
|
case "$1" in
|
||||||
|
--cache-dir)
|
||||||
|
CACHE_DIR="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--rebuild-image)
|
||||||
|
REBUILD_IMAGE=1
|
||||||
|
shift
|
||||||
|
;;
|
||||||
|
--authorized-keys)
|
||||||
|
AUTH_KEYS="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "unknown arg: $1" >&2
|
||||||
|
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--authorized-keys /path/to/authorized_keys]" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
|
||||||
|
echo "container tool not found: $CONTAINER_TOOL" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
PLATFORM_OS="${BUILDER_PLATFORM%/*}"
|
||||||
|
PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
|
||||||
|
if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
|
||||||
|
echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -n "$AUTH_KEYS" ]; then
|
||||||
|
[ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
|
||||||
|
AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
|
||||||
|
AUTH_KEYS_DIR="$(dirname "$AUTH_KEYS_ABS")"
|
||||||
|
AUTH_KEYS_BASE="$(basename "$AUTH_KEYS_ABS")"
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p \
|
||||||
|
"${CACHE_DIR}" \
|
||||||
|
"${CACHE_DIR}/go-build" \
|
||||||
|
"${CACHE_DIR}/go-mod" \
|
||||||
|
"${CACHE_DIR}/tmp" \
|
||||||
|
"${CACHE_DIR}/bee"
|
||||||
|
|
||||||
|
IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
|
||||||
|
|
||||||
|
image_matches_platform() {
|
||||||
|
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)"
|
||||||
|
[ "$actual_platform" = "${BUILDER_PLATFORM}" ]
|
||||||
|
}
|
||||||
|
|
||||||
|
NEED_BUILD_IMAGE=0
|
||||||
|
if [ "$REBUILD_IMAGE" = "1" ]; then
|
||||||
|
NEED_BUILD_IMAGE=1
|
||||||
|
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||||
|
NEED_BUILD_IMAGE=1
|
||||||
|
elif ! image_matches_platform; then
|
||||||
|
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
|
||||||
|
echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
|
||||||
|
NEED_BUILD_IMAGE=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$NEED_BUILD_IMAGE" = "1" ]; then
|
||||||
|
"$CONTAINER_TOOL" build \
|
||||||
|
--platform "${BUILDER_PLATFORM}" \
|
||||||
|
--build-arg GO_VERSION="${GO_VERSION}" \
|
||||||
|
-t "${IMAGE_REF}" \
|
||||||
|
"${BUILDER_DIR}"
|
||||||
|
else
|
||||||
|
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||||
|
fi
|
||||||
|
|
||||||
|
set -- \
|
||||||
|
run --rm --privileged \
|
||||||
|
--platform "${BUILDER_PLATFORM}" \
|
||||||
|
-v "${REPO_ROOT}:/work" \
|
||||||
|
-v "${CACHE_DIR}:/cache" \
|
||||||
|
-e BEE_CONTAINER_BUILD=1 \
|
||||||
|
-e GOCACHE=/cache/go-build \
|
||||||
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
|
-e TMPDIR=/cache/tmp \
|
||||||
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-w /work \
|
||||||
|
"${IMAGE_REF}" \
|
||||||
|
sh /work/iso/builder/build.sh
|
||||||
|
|
||||||
|
if [ -n "$AUTH_KEYS" ]; then
|
||||||
|
set -- run --rm --privileged \
|
||||||
|
--platform "${BUILDER_PLATFORM}" \
|
||||||
|
-v "${REPO_ROOT}:/work" \
|
||||||
|
-v "${CACHE_DIR}:/cache" \
|
||||||
|
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||||
|
-e BEE_CONTAINER_BUILD=1 \
|
||||||
|
-e GOCACHE=/cache/go-build \
|
||||||
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
|
-e TMPDIR=/cache/tmp \
|
||||||
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-w /work \
|
||||||
|
"${IMAGE_REF}" \
|
||||||
|
sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
"$CONTAINER_TOOL" "$@"
|
||||||
138
iso/builder/build-nccl-tests.sh
Executable file
138
iso/builder/build-nccl-tests.sh
Executable file
@@ -0,0 +1,138 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# build-nccl-tests.sh — build nccl-tests all_reduce_perf for the LiveCD.
|
||||||
|
#
|
||||||
|
# Downloads nccl-tests source from GitHub, downloads libnccl-dev .deb for
|
||||||
|
# nccl.h, and compiles all_reduce_perf with nvcc (cuda-nvcc-13-0).
|
||||||
|
#
|
||||||
|
# Output is cached in DIST_DIR/nccl-tests-<version>/ so subsequent builds
|
||||||
|
# are instant unless NCCL_TESTS_VERSION changes.
|
||||||
|
#
|
||||||
|
# Output layout:
|
||||||
|
# $CACHE_DIR/bin/all_reduce_perf
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
NCCL_TESTS_VERSION="$1"
|
||||||
|
NCCL_VERSION="$2"
|
||||||
|
NCCL_CUDA_VERSION="$3"
|
||||||
|
DIST_DIR="$4"
|
||||||
|
|
||||||
|
[ -n "$NCCL_TESTS_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"; exit 1; }
|
||||||
|
[ -n "$NCCL_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"; exit 1; }
|
||||||
|
[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"; exit 1; }
|
||||||
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"; exit 1; }
|
||||||
|
|
||||||
|
echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||||
|
|
||||||
|
CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||||
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"
|
||||||
|
|
||||||
|
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
|
||||||
|
echo "=== nccl-tests cached, skipping build ==="
|
||||||
|
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Resolve nvcc path (cuda-nvcc-12-8 installs to /usr/local/cuda-12.8/bin/nvcc).
# Prefer a PATH-resolved absolute path so CUDA_HOME below is derived correctly
# even when nvcc is found as a bare command name (the original
# dirname(dirname("nvcc")) would have degenerated to ".").
NVCC=""
for candidate in nvcc /usr/local/cuda-12.8/bin/nvcc /usr/local/cuda-12/bin/nvcc /usr/local/cuda/bin/nvcc; do
  if resolved="$(command -v "$candidate" 2>/dev/null)"; then
    NVCC="$resolved"
    break
  fi
  if [ -x "$candidate" ]; then
    NVCC="$candidate"
    break
  fi
done
# NB: the package actually installed by the builder image is cuda-nvcc-12-8
# (see iso/builder/Dockerfile); the old message said cuda-nvcc-13-0.
[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-12-8"; exit 1; }
echo "nvcc: $NVCC"

# Determine CUDA_HOME from the (now absolute) nvcc location,
# e.g. /usr/local/cuda-12.8/bin/nvcc -> /usr/local/cuda-12.8
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
echo "CUDA_HOME: $CUDA_HOME"
|
||||||
|
|
||||||
|
# Download libnccl-dev for nccl.h
|
||||||
|
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
|
||||||
|
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
||||||
|
DEV_URL="${REPO_BASE}/${DEV_PKG}"
|
||||||
|
|
||||||
|
mkdir -p "$DOWNLOAD_CACHE_DIR"
|
||||||
|
DEV_DEB="${DOWNLOAD_CACHE_DIR}/${DEV_PKG}"
|
||||||
|
|
||||||
|
if [ ! -f "$DEV_DEB" ]; then
|
||||||
|
echo "=== downloading libnccl-dev ==="
|
||||||
|
wget --show-progress -O "$DEV_DEB" "$DEV_URL"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Extract nccl.h from libnccl-dev
|
||||||
|
NCCL_INCLUDE_TMP=$(mktemp -d)
|
||||||
|
trap 'rm -rf "$NCCL_INCLUDE_TMP" "$BUILD_TMP"' EXIT INT TERM
|
||||||
|
|
||||||
|
cd "$NCCL_INCLUDE_TMP"
|
||||||
|
ar x "$DEV_DEB"
|
||||||
|
DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
|
||||||
|
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in libnccl-dev .deb"; exit 1; }
|
||||||
|
tar xf "$DATA_TAR"
|
||||||
|
|
||||||
|
# nccl.h lands in ./usr/include/ or ./usr/local/cuda-X.Y/targets/.../include/
|
||||||
|
NCCL_H=$(find . -name 'nccl.h' -type f 2>/dev/null | head -1)
|
||||||
|
[ -n "$NCCL_H" ] || { echo "ERROR: nccl.h not found in libnccl-dev package"; exit 1; }
|
||||||
|
NCCL_INCLUDE_DIR="$(pwd)/$(dirname "$NCCL_H")"
|
||||||
|
echo "nccl.h: $NCCL_H"
|
||||||
|
|
||||||
|
# libnccl.so comes from the already-built NCCL cache (build-nccl.sh ran first)
|
||||||
|
NCCL_LIB_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}/lib"
|
||||||
|
[ -d "$NCCL_LIB_DIR" ] || { echo "ERROR: NCCL lib dir not found at $NCCL_LIB_DIR — run build-nccl.sh first"; exit 1; }
|
||||||
|
echo "nccl lib: $NCCL_LIB_DIR"
|
||||||
|
|
||||||
|
# Download nccl-tests source
|
||||||
|
SRC_TAR="${DOWNLOAD_CACHE_DIR}/nccl-tests-v${NCCL_TESTS_VERSION}.tar.gz"
|
||||||
|
SRC_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz"
|
||||||
|
|
||||||
|
if [ ! -f "$SRC_TAR" ]; then
|
||||||
|
echo "=== downloading nccl-tests v${NCCL_TESTS_VERSION} ==="
|
||||||
|
wget --show-progress -O "$SRC_TAR" "$SRC_URL"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Extract and build
|
||||||
|
BUILD_TMP=$(mktemp -d)
|
||||||
|
cd "$BUILD_TMP"
|
||||||
|
tar xf "$SRC_TAR"
|
||||||
|
SRC_DIR=$(ls -d nccl-tests-* 2>/dev/null | head -1)
|
||||||
|
[ -n "$SRC_DIR" ] || { echo "ERROR: source directory not found in archive"; exit 1; }
|
||||||
|
cd "$SRC_DIR"
|
||||||
|
|
||||||
|
echo "=== building all_reduce_perf ==="
|
||||||
|
# Pick gencode based on the actual nvcc version:
|
||||||
|
# CUDA 12.x — Volta..Blackwell (sm_70..sm_100)
|
||||||
|
# CUDA 13.x — Hopper..Blackwell (sm_90..sm_100, Pascal/Volta/Ampere dropped)
|
||||||
|
NVCC_MAJOR=$("$NVCC" --version 2>/dev/null | grep -oE 'release [0-9]+' | awk '{print $2}' | head -1)
|
||||||
|
echo "nvcc major version: ${NVCC_MAJOR:-unknown}"
|
||||||
|
if [ "${NVCC_MAJOR:-0}" -ge 13 ] 2>/dev/null; then
|
||||||
|
GENCODE="-gencode=arch=compute_90,code=sm_90 \
|
||||||
|
-gencode=arch=compute_100,code=sm_100"
|
||||||
|
echo "gencode: sm_90 sm_100 (CUDA 13+)"
|
||||||
|
else
|
||||||
|
GENCODE="-gencode=arch=compute_70,code=sm_70 \
|
||||||
|
-gencode=arch=compute_80,code=sm_80 \
|
||||||
|
-gencode=arch=compute_86,code=sm_86 \
|
||||||
|
-gencode=arch=compute_90,code=sm_90 \
|
||||||
|
-gencode=arch=compute_100,code=sm_100"
|
||||||
|
echo "gencode: sm_70..sm_100 (CUDA 12)"
|
||||||
|
fi
|
||||||
|
LIBRARY_PATH="$NCCL_LIB_DIR${LIBRARY_PATH:+:$LIBRARY_PATH}" \
|
||||||
|
make MPI=0 \
|
||||||
|
NVCC="$NVCC" \
|
||||||
|
CUDA_HOME="$CUDA_HOME" \
|
||||||
|
NCCL_HOME="$NCCL_INCLUDE_DIR/.." \
|
||||||
|
NCCL_LIB="$NCCL_LIB_DIR" \
|
||||||
|
NVCC_GENCODE="$GENCODE" \
|
||||||
|
BUILDDIR="./build"
|
||||||
|
|
||||||
|
[ -f "./build/all_reduce_perf" ] || { echo "ERROR: all_reduce_perf not found after build"; exit 1; }
|
||||||
|
|
||||||
|
mkdir -p "${CACHE_DIR}/bin"
|
||||||
|
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
|
chmod +x "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
|
|
||||||
|
echo "=== nccl-tests build complete ==="
|
||||||
|
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
|
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
94
iso/builder/build-nccl.sh
Executable file
94
iso/builder/build-nccl.sh
Executable file
@@ -0,0 +1,94 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# build-nccl.sh — download and extract NCCL shared library for the LiveCD.
|
||||||
|
#
|
||||||
|
# Downloads libnccl2 .deb from NVIDIA's CUDA apt repository (Debian 12, x86_64)
|
||||||
|
# and extracts the shared library. Package integrity verified via sha256.
|
||||||
|
#
|
||||||
|
# Output is cached in DIST_DIR/nccl-<version>+cuda<cuda>/ so subsequent builds
|
||||||
|
# are instant unless NCCL_VERSION or NCCL_CUDA_VERSION changes.
|
||||||
|
#
|
||||||
|
# Output layout:
|
||||||
|
# $CACHE_DIR/lib/ — libnccl.so.* files
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
NCCL_VERSION="$1"
|
||||||
|
NCCL_CUDA_VERSION="$2"
|
||||||
|
DIST_DIR="$3"
|
||||||
|
EXPECTED_SHA256="$4"
|
||||||
|
|
||||||
|
[ -n "$NCCL_VERSION" ] || { echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]"; exit 1; }
|
||||||
|
[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]"; exit 1; }
|
||||||
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]"; exit 1; }
|
||||||
|
|
||||||
|
echo "=== NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||||
|
|
||||||
|
CACHE_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-downloads"
|
||||||
|
|
||||||
|
if [ -d "$CACHE_DIR/lib" ] && [ "$(ls "$CACHE_DIR/lib/"libnccl.so.* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||||
|
echo "=== NCCL cached, skipping download ==="
|
||||||
|
echo "cache: $CACHE_DIR"
|
||||||
|
echo "libs: $(ls "$CACHE_DIR/lib/" | wc -l) files"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
|
||||||
|
PKG_NAME="libnccl2_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
||||||
|
PKG_URL="${REPO_BASE}/${PKG_NAME}"
|
||||||
|
|
||||||
|
mkdir -p "$DOWNLOAD_CACHE_DIR"
|
||||||
|
DEB_FILE="${DOWNLOAD_CACHE_DIR}/${PKG_NAME}"
|
||||||
|
|
||||||
|
echo "=== downloading NCCL package ==="
|
||||||
|
echo "URL: ${PKG_URL}"
|
||||||
|
wget --show-progress -O "$DEB_FILE" "$PKG_URL"
|
||||||
|
|
||||||
|
if [ -n "$EXPECTED_SHA256" ]; then
|
||||||
|
echo "=== verifying sha256 ==="
|
||||||
|
ACTUAL_SHA256=$(sha256sum "$DEB_FILE" | awk '{print $1}')
|
||||||
|
if [ "$ACTUAL_SHA256" != "$EXPECTED_SHA256" ]; then
|
||||||
|
echo "ERROR: sha256 mismatch"
|
||||||
|
echo " expected: $EXPECTED_SHA256"
|
||||||
|
echo " actual: $ACTUAL_SHA256"
|
||||||
|
rm -f "$DEB_FILE"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "sha256 OK"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "=== extracting NCCL libraries ==="
|
||||||
|
EXTRACT_TMP=$(mktemp -d)
|
||||||
|
trap 'rm -rf "$EXTRACT_TMP"' EXIT INT TERM
|
||||||
|
|
||||||
|
# .deb is an ar archive; data.tar.* contains the actual files
|
||||||
|
cd "$EXTRACT_TMP"
|
||||||
|
ar x "$DEB_FILE"
|
||||||
|
|
||||||
|
# Extract data tarball (xz, gz, or zst)
|
||||||
|
DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
|
||||||
|
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in .deb"; exit 1; }
|
||||||
|
tar xf "$DATA_TAR"
|
||||||
|
|
||||||
|
# Library lands in ./usr/lib/x86_64-linux-gnu/ or ./usr/lib/
|
||||||
|
mkdir -p "$CACHE_DIR/lib"
|
||||||
|
found=0
|
||||||
|
for f in $(find . -name 'libnccl.so.*' -not -type d 2>/dev/null); do
|
||||||
|
cp "$f" "$CACHE_DIR/lib/"
|
||||||
|
found=$((found + 1))
|
||||||
|
done
|
||||||
|
|
||||||
|
[ "$found" -gt 0 ] || { echo "ERROR: libnccl.so.* not found in package"; exit 1; }
|
||||||
|
|
||||||
|
# Create soname symlinks: libnccl.so.2 -> libnccl.so.<full>, libnccl.so -> libnccl.so.2
|
||||||
|
versioned=$(ls "$CACHE_DIR/lib/libnccl.so."[0-9][0-9.]* 2>/dev/null | head -1)
|
||||||
|
if [ -n "$versioned" ]; then
|
||||||
|
base=$(basename "$versioned")
|
||||||
|
ln -sf "$base" "$CACHE_DIR/lib/libnccl.so.2" 2>/dev/null || true
|
||||||
|
ln -sf "libnccl.so.2" "$CACHE_DIR/lib/libnccl.so" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "=== NCCL extraction complete ==="
|
||||||
|
echo "cache: $CACHE_DIR"
|
||||||
|
ls -lh "$CACHE_DIR/lib/"
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# build-nvidia-module.sh — install NVIDIA proprietary driver into ISO overlay
|
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
|
||||||
#
|
#
|
||||||
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
||||||
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
||||||
@@ -16,55 +16,54 @@ set -e
|
|||||||
|
|
||||||
NVIDIA_VERSION="$1"
|
NVIDIA_VERSION="$1"
|
||||||
DIST_DIR="$2"
|
DIST_DIR="$2"
|
||||||
ALPINE_VERSION="$3"
|
DEBIAN_KERNEL_ABI="$3"
|
||||||
|
|
||||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <alpine-version>"; exit 1; }
|
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <alpine-version>"; exit 1; }
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||||
[ -n "$ALPINE_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <alpine-version>"; exit 1; }
|
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||||
|
|
||||||
# Install linux-lts-dev (no version pin — always use whatever is current in Alpine 3.21 main).
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
# This ensures modules are compiled for the same kernel that mkimage will install in the ISO.
|
# On Debian, kernel headers are split into two packages:
|
||||||
# Both use dl-cdn.alpinelinux.org, so they see the same package state at build time.
|
# linux-headers-<kver> — arch-specific (generated, Makefile)
|
||||||
echo "=== installing linux-lts-dev (latest from dl-cdn) ==="
|
# linux-headers-<kver>-common — common source headers (linux/, asm-generic/, etc.)
|
||||||
apk add --quiet --update \
|
# NVIDIA conftest needs SYSSRC pointing to common (for source headers like linux/mm.h)
|
||||||
--repository "https://dl-cdn.alpinelinux.org/alpine/v${ALPINE_VERSION}/main" \
|
# and SYSOUT pointing to amd64 (for generated headers like autoconf.h, asm/).
|
||||||
linux-lts-dev
|
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
||||||
|
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
||||||
|
|
||||||
# Detect kernel version from installed headers (pick highest version if multiple).
|
|
||||||
detect_kver() {
|
|
||||||
ls /usr/src/ 2>/dev/null \
|
|
||||||
| grep '^linux-headers-' \
|
|
||||||
| sed 's/linux-headers-//' \
|
|
||||||
| sort -V \
|
|
||||||
| tail -1
|
|
||||||
}
|
|
||||||
|
|
||||||
KVER="$(detect_kver)"
|
|
||||||
KDIR="/usr/src/linux-headers-${KVER}"
|
|
||||||
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
||||||
|
|
||||||
|
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||||
|
echo "=== installing linux-headers-${KVER} ==="
|
||||||
|
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||||
|
"linux-headers-${KVER}" \
|
||||||
|
gcc make perl
|
||||||
|
fi
|
||||||
|
echo "kernel headers (arch): $KDIR_ARCH"
|
||||||
|
echo "kernel headers (common): $KDIR_COMMON"
|
||||||
|
|
||||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
||||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||||
|
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||||
|
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||||
|
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||||
echo "=== NVIDIA cached, skipping build ==="
|
echo "=== NVIDIA cached, skipping build ==="
|
||||||
echo "cache: $CACHE_DIR"
|
echo "cache: $CACHE_DIR"
|
||||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Install build dependencies (linux-lts-dev already at correct version from above)
|
# Download official NVIDIA .run installer with sha256 verification
|
||||||
apk add --quiet \
|
|
||||||
--repository "https://dl-cdn.alpinelinux.org/alpine/v${ALPINE_VERSION}/main" \
|
|
||||||
gcc make perl linux-lts-dev wget
|
|
||||||
|
|
||||||
# Download official NVIDIA .run installer (proprietary) with sha256 verification
|
|
||||||
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
||||||
RUN_FILE="/var/tmp/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
|
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
||||||
SHA_FILE="/var/tmp/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run.sha256sum"
|
RUN_FILE="${DOWNLOAD_CACHE_DIR}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
|
||||||
|
SHA_FILE="${DOWNLOAD_CACHE_DIR}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run.sha256sum"
|
||||||
|
|
||||||
verify_run() {
|
verify_run() {
|
||||||
[ -s "$SHA_FILE" ] || return 1
|
[ -s "$SHA_FILE" ] || return 1
|
||||||
[ -s "$RUN_FILE" ] || return 1
|
[ -s "$RUN_FILE" ] || return 1
|
||||||
cd /var/tmp
|
cd "$DOWNLOAD_CACHE_DIR"
|
||||||
sha256sum -c "$SHA_FILE" --status 2>/dev/null
|
sha256sum -c "$SHA_FILE" --status 2>/dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -75,7 +74,7 @@ if ! verify_run; then
|
|||||||
echo "sha256: $(cat "$SHA_FILE")"
|
echo "sha256: $(cat "$SHA_FILE")"
|
||||||
wget --show-progress -O "$RUN_FILE" "${BASE_URL}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
|
wget --show-progress -O "$RUN_FILE" "${BASE_URL}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
|
||||||
echo "=== verifying sha256 ==="
|
echo "=== verifying sha256 ==="
|
||||||
cd /var/tmp && sha256sum -c "$SHA_FILE" || { echo "ERROR: sha256 mismatch"; rm -f "$RUN_FILE"; exit 1; }
|
cd "$DOWNLOAD_CACHE_DIR" && sha256sum -c "$SHA_FILE" || { echo "ERROR: sha256 mismatch"; rm -f "$RUN_FILE"; exit 1; }
|
||||||
echo "sha256 OK"
|
echo "sha256 OK"
|
||||||
else
|
else
|
||||||
echo "=== NVIDIA installer verified from cache ==="
|
echo "=== NVIDIA installer verified from cache ==="
|
||||||
@@ -84,7 +83,7 @@ fi
|
|||||||
# Extract installer contents
|
# Extract installer contents
|
||||||
echo "=== extracting installer ==="
|
echo "=== extracting installer ==="
|
||||||
chmod +x "$RUN_FILE"
|
chmod +x "$RUN_FILE"
|
||||||
EXTRACT_DIR="/var/tmp/nvidia-extract-${NVIDIA_VERSION}"
|
EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
|
||||||
rm -rf "$EXTRACT_DIR"
|
rm -rf "$EXTRACT_DIR"
|
||||||
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
||||||
|
|
||||||
@@ -96,10 +95,20 @@ done
|
|||||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||||
echo "kernel source: $KERNEL_SRC"
|
echo "kernel source: $KERNEL_SRC"
|
||||||
|
|
||||||
# Build kernel modules from extracted source
|
# Build kernel modules
|
||||||
|
# CFLAGS_MODULE: add GCC include dir so NVIDIA's nv_stdarg.h can find stdarg.h.
|
||||||
|
# Kernel build uses -nostdinc which strips GCC's own includes; we restore it here.
|
||||||
echo "=== building kernel modules ($(nproc) cores) ==="
|
echo "=== building kernel modules ($(nproc) cores) ==="
|
||||||
cd "$KERNEL_SRC"
|
cd "$KERNEL_SRC"
|
||||||
make -j$(nproc) KERNEL_UNAME="$KVER" SYSSRC="$KDIR" modules 2>&1 | tail -5
|
# SYSSRC=common: conftest finds real kernel headers (linux/mm.h etc.)
|
||||||
|
# SYSOUT=amd64: generated headers (autoconf.h, asm/) from arch package
|
||||||
|
# Without this split, conftest uses amd64/include/ which is nearly empty,
|
||||||
|
# all compile-tests fail silently, and NVIDIA assumes all APIs present → link errors.
|
||||||
|
make -j$(nproc) \
|
||||||
|
KERNEL_UNAME="$KVER" \
|
||||||
|
SYSSRC="$KDIR_COMMON" \
|
||||||
|
SYSOUT="$KDIR_ARCH" \
|
||||||
|
modules 2>&1 | tail -10
|
||||||
|
|
||||||
# Collect outputs
|
# Collect outputs
|
||||||
mkdir -p "$CACHE_DIR/modules" "$CACHE_DIR/bin" "$CACHE_DIR/lib"
|
mkdir -p "$CACHE_DIR/modules" "$CACHE_DIR/bin" "$CACHE_DIR/lib"
|
||||||
@@ -112,32 +121,42 @@ done
|
|||||||
cp "$EXTRACT_DIR/nvidia-smi" "$CACHE_DIR/bin/"
|
cp "$EXTRACT_DIR/nvidia-smi" "$CACHE_DIR/bin/"
|
||||||
cp "$EXTRACT_DIR/nvidia-bug-report.sh" "$CACHE_DIR/bin/" 2>/dev/null || true
|
cp "$EXTRACT_DIR/nvidia-bug-report.sh" "$CACHE_DIR/bin/" 2>/dev/null || true
|
||||||
|
|
||||||
# Copy userspace libraries — use find to handle any versioning scheme (libnvidia-ml.so.X.Y.Z or .so.1)
|
# Copy GSP firmware (required for Hopper/Ada GPUs — H100, H800, etc.)
|
||||||
for lib in libnvidia-ml libcuda; do
|
mkdir -p "$CACHE_DIR/firmware"
|
||||||
found=$(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" | head -1)
|
if [ -d "$EXTRACT_DIR/firmware" ]; then
|
||||||
if [ -z "$found" ]; then
|
cp -r "$EXTRACT_DIR/firmware/." "$CACHE_DIR/firmware/"
|
||||||
|
echo "firmware: $(ls "$CACHE_DIR/firmware/" | wc -l) files"
|
||||||
|
else
|
||||||
|
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Copy ALL userspace library files.
|
||||||
|
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||||
|
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||||
|
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||||
|
count=0
|
||||||
|
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||||
|
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||||
|
done
|
||||||
|
if [ "$count" -eq 0 ]; then
|
||||||
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
||||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
cp "$found" "$CACHE_DIR/lib/"
|
|
||||||
done
|
done
|
||||||
|
|
||||||
# Verify .ko files were actually built
|
# Verify .ko files were built
|
||||||
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||||
|
|
||||||
# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat + libc6-compat)
|
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||||
for lib in libnvidia-ml libcuda; do
|
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."* 2>/dev/null | grep -v '\.so\.1$' | head -1)
|
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||||
[ -n "$versioned" ] || versioned=$(ls "$CACHE_DIR/lib/${lib}.so."* 2>/dev/null | head -1)
|
|
||||||
[ -n "$versioned" ] || continue
|
[ -n "$versioned" ] || continue
|
||||||
base=$(basename "$versioned")
|
base=$(basename "$versioned")
|
||||||
# Only create .so.1 if versioned file is not already named .so.1
|
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
||||||
if [ "$base" != "${lib}.so.1" ]; then
|
|
||||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
|
||||||
fi
|
|
||||||
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
||||||
|
echo "${lib}: .so.1 -> $base"
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "=== NVIDIA build complete ==="
|
echo "=== NVIDIA build complete ==="
|
||||||
|
|||||||
@@ -1,19 +1,21 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# build.sh — build bee ISO
|
# build.sh — internal ISO build entrypoint executed inside the builder container.
|
||||||
#
|
|
||||||
# Single build script. Produces a bootable live ISO with SSH access, TUI, NVIDIA drivers.
|
|
||||||
#
|
|
||||||
# Run on Alpine builder VM as root after setup-builder.sh.
|
|
||||||
# Usage:
|
|
||||||
# sh iso/builder/build.sh [--authorized-keys /path/to/authorized_keys]
|
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
if [ "${BEE_CONTAINER_BUILD:-0}" != "1" ]; then
|
||||||
|
echo "build.sh must run inside iso/builder/build-in-container.sh" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||||
BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||||
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
|
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
|
||||||
DIST_DIR="${REPO_ROOT}/dist"
|
DIST_DIR="${REPO_ROOT}/dist"
|
||||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||||
|
BUILD_WORK_DIR="${DIST_DIR}/live-build-work"
|
||||||
|
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage"
|
||||||
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
|
|
||||||
# parse args
|
# parse args
|
||||||
@@ -26,49 +28,196 @@ done
|
|||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
|
mkdir -p "${DIST_DIR}"
|
||||||
|
mkdir -p "${CACHE_ROOT}"
|
||||||
|
: "${GOCACHE:=${CACHE_ROOT}/go-build}"
|
||||||
|
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
||||||
|
export GOCACHE GOMODCACHE
|
||||||
|
|
||||||
# NOTE: lz4 compression for modloop is disabled — Alpine initramfs may not support lz4 squashfs.
|
resolve_audit_version() {
|
||||||
# Default xz compression is used until lz4 support is confirmed.
|
if [ -n "${BEE_AUDIT_VERSION:-}" ]; then
|
||||||
|
echo "${BEE_AUDIT_VERSION}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||||
|
if [ -z "${tag}" ]; then
|
||||||
|
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||||
|
fi
|
||||||
|
case "${tag}" in
|
||||||
|
audit/v*)
|
||||||
|
echo "${tag#audit/v}"
|
||||||
|
return 0
|
||||||
|
;;
|
||||||
|
v*)
|
||||||
|
echo "${tag#v}"
|
||||||
|
return 0
|
||||||
|
;;
|
||||||
|
"")
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "${tag}"
|
||||||
|
return 0
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
if [ -n "${AUDIT_VERSION:-}" ]; then
|
||||||
|
echo "${AUDIT_VERSION}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
date +%Y%m%d
|
||||||
|
}
|
||||||
|
|
||||||
|
# ISO image versioned separately from the audit binary (iso/v* tags).
|
||||||
|
resolve_iso_version() {
|
||||||
|
if [ -n "${BEE_ISO_VERSION:-}" ]; then
|
||||||
|
echo "${BEE_ISO_VERSION}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'iso/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||||
|
case "${tag}" in
|
||||||
|
iso/v*)
|
||||||
|
echo "${tag#iso/v}"
|
||||||
|
return 0
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
# Also accept plain v* tags (e.g. v2, v2.1 used for GUI releases)
|
||||||
|
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||||
|
case "${tag}" in
|
||||||
|
v*)
|
||||||
|
echo "${tag#v}"
|
||||||
|
return 0
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
# Fall back to audit version so the name is still meaningful
|
||||||
|
resolve_audit_version
|
||||||
|
}
|
||||||
|
|
||||||
|
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||||
|
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||||
|
|
||||||
|
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
|
||||||
|
# If headers for the detected ABI are not yet installed (kernel updated since image build),
|
||||||
|
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||||
|
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
||||||
|
echo "=== refreshing apt index to detect current kernel ABI ==="
|
||||||
|
apt-get update -qq
|
||||||
|
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
||||||
|
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
||||||
|
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
||||||
|
| head -1)
|
||||||
|
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
||||||
|
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Export detected ABI so that auto/config can pin the exact kernel package
|
||||||
|
# (prevents NVIDIA module/kernel mismatch if linux-image-amd64 meta-package
|
||||||
|
# gets updated between build.sh start and lb build chroot step)
|
||||||
|
export BEE_KERNEL_ABI="${DEBIAN_KERNEL_ABI}"
|
||||||
|
|
||||||
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
|
if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
||||||
|
echo "=== installing linux-headers-${KVER} (kernel updated since image build) ==="
|
||||||
|
apt-get install -y "linux-headers-${KVER}"
|
||||||
|
fi
|
||||||
|
|
||||||
echo "=== bee ISO build ==="
|
echo "=== bee ISO build ==="
|
||||||
echo "Alpine: ${ALPINE_VERSION}, Go: ${GO_VERSION}"
|
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||||
|
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
# --- compile audit binary (static, Linux amd64) ---
|
echo "=== syncing git submodules ==="
|
||||||
# Skip rebuild if binary is newer than all Go source files.
|
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||||
AUDIT_BIN="${DIST_DIR}/bee-audit-linux-amd64"
|
|
||||||
|
# --- compile bee binary (static, Linux amd64) ---
|
||||||
|
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
||||||
|
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||||
NEED_BUILD=1
|
NEED_BUILD=1
|
||||||
if [ -f "$AUDIT_BIN" ]; then
|
if [ -f "$BEE_BIN" ]; then
|
||||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$AUDIT_BIN" | head -1)
|
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||||
[ -z "$NEWEST_SRC" ] && NEED_BUILD=0
|
[ -z "$NEWEST_SRC" ] && NEED_BUILD=0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$NEED_BUILD" = "1" ]; then
|
if [ "$NEED_BUILD" = "1" ]; then
|
||||||
echo "=== building audit binary ==="
|
echo "=== building bee binary ==="
|
||||||
cd "${REPO_ROOT}/audit"
|
cd "${REPO_ROOT}/audit"
|
||||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||||
go build \
|
go build \
|
||||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION:-$(date +%Y%m%d)}" \
|
-ldflags "-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}" \
|
||||||
-o "$AUDIT_BIN" \
|
-o "$BEE_BIN" \
|
||||||
./cmd/audit
|
./cmd/bee
|
||||||
echo "binary: $AUDIT_BIN"
|
echo "binary: $BEE_BIN"
|
||||||
echo "size: $(du -sh "$AUDIT_BIN" | cut -f1)"
|
if command -v stat >/dev/null 2>&1; then
|
||||||
|
BEE_SIZE_BYTES="$(stat -c '%s' "$BEE_BIN" 2>/dev/null || stat -f '%z' "$BEE_BIN")"
|
||||||
|
else
|
||||||
|
BEE_SIZE_BYTES="$(wc -c < "$BEE_BIN" | tr -d ' ')"
|
||||||
|
fi
|
||||||
|
if command -v numfmt >/dev/null 2>&1; then
|
||||||
|
echo "size: $(numfmt --to=iec --suffix=B "$BEE_SIZE_BYTES")"
|
||||||
|
else
|
||||||
|
echo "size: ${BEE_SIZE_BYTES} bytes"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo "=== audit binary up to date, skipping build ==="
|
echo "=== bee binary up to date, skipping build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||||
|
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||||
|
"${CUBLAS_VERSION}" \
|
||||||
|
"${CUDA_USERSPACE_VERSION}" \
|
||||||
|
"${NCCL_CUDA_VERSION}" \
|
||||||
|
"${DIST_DIR}"
|
||||||
|
|
||||||
|
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
|
GPU_STRESS_NEED_BUILD=1
|
||||||
|
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||||
|
GPU_STRESS_NEED_BUILD=0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||||
|
echo "=== building bee-gpu-stress ==="
|
||||||
|
gcc -O2 -s -Wall -Wextra \
|
||||||
|
-I"${CUBLAS_CACHE}/include" \
|
||||||
|
-o "$GPU_STRESS_BIN" \
|
||||||
|
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||||
|
-ldl -lm
|
||||||
|
echo "binary: $GPU_STRESS_BIN"
|
||||||
|
else
|
||||||
|
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "=== preparing staged overlay ==="
|
||||||
|
rm -rf "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
rsync -a "${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||||
|
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||||
|
rm -f \
|
||||||
|
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
|
||||||
# --- inject authorized_keys for SSH access ---
|
# --- inject authorized_keys for SSH access ---
|
||||||
# Uses the same Ed25519 keys as release signing (from git.mchus.pro/mchus/keys).
|
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
||||||
# SSH public keys are stored alongside signing keys as ~/.keys/<name>.key.pub
|
mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh"
|
||||||
AUTHORIZED_KEYS_FILE="${OVERLAY_DIR}/root/.ssh/authorized_keys"
|
|
||||||
mkdir -p "${OVERLAY_DIR}/root/.ssh"
|
|
||||||
|
|
||||||
if [ -n "$AUTH_KEYS" ]; then
|
if [ -n "$AUTH_KEYS" ]; then
|
||||||
cp "$AUTH_KEYS" "$AUTHORIZED_KEYS_FILE"
|
cp "$AUTH_KEYS" "$AUTHORIZED_KEYS_FILE"
|
||||||
chmod 600 "$AUTHORIZED_KEYS_FILE"
|
chmod 600 "$AUTHORIZED_KEYS_FILE"
|
||||||
echo "SSH authorized_keys: installed from $AUTH_KEYS"
|
echo "SSH authorized_keys: installed from $AUTH_KEYS"
|
||||||
else
|
else
|
||||||
# auto-collect all developer SSH public keys from ~/.keys/*.key.pub
|
|
||||||
> "$AUTHORIZED_KEYS_FILE"
|
> "$AUTHORIZED_KEYS_FILE"
|
||||||
FOUND=0
|
FOUND=0
|
||||||
for ssh_pub in "$HOME"/.keys/*.key.pub; do
|
for ssh_pub in "$HOME"/.keys/*.key.pub; do
|
||||||
@@ -82,127 +231,167 @@ else
|
|||||||
echo "SSH authorized_keys: $FOUND key(s) from ~/.keys/*.key.pub"
|
echo "SSH authorized_keys: $FOUND key(s) from ~/.keys/*.key.pub"
|
||||||
else
|
else
|
||||||
echo "WARNING: no SSH public keys found — falling back to password auth"
|
echo "WARNING: no SSH public keys found — falling back to password auth"
|
||||||
echo " SSH login: bee / eeb (user created by bee-sshsetup at boot)"
|
echo " SSH login: bee / eeb"
|
||||||
echo " (generate a key with: sh keys/scripts/keygen.sh <your-name>)"
|
|
||||||
USE_PASSWORD_FALLBACK=1
|
USE_PASSWORD_FALLBACK=1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- password fallback: write marker file read by init script ---
|
|
||||||
if [ "${USE_PASSWORD_FALLBACK:-0}" = "1" ]; then
|
if [ "${USE_PASSWORD_FALLBACK:-0}" = "1" ]; then
|
||||||
touch "${OVERLAY_DIR}/etc/bee-ssh-password-fallback"
|
touch "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback"
|
||||||
|
else
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- copy audit binary into overlay ---
|
# --- copy bee binary into overlay ---
|
||||||
mkdir -p "${OVERLAY_DIR}/usr/local/bin"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||||
cp "${DIST_DIR}/bee-audit-linux-amd64" "${OVERLAY_DIR}/usr/local/bin/audit"
|
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
chmod +x "${OVERLAY_DIR}/usr/local/bin/audit"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
|
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||||
|
|
||||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||||
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_DIR}/usr/local/bin/bee-smoketest"
|
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||||
chmod +x "${OVERLAY_DIR}/usr/local/bin/bee-smoketest"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||||
|
|
||||||
# --- vendor utilities (optional pre-fetched binaries) ---
|
# --- vendor utilities (optional pre-fetched binaries) ---
|
||||||
for tool in storcli64 sas2ircu sas3ircu mstflint; do
|
for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
||||||
if [ -f "${VENDOR_DIR}/${tool}" ]; then
|
if [ -f "${VENDOR_DIR}/${tool}" ]; then
|
||||||
cp "${VENDOR_DIR}/${tool}" "${OVERLAY_DIR}/usr/local/bin/${tool}"
|
cp "${VENDOR_DIR}/${tool}" "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}"
|
||||||
chmod +x "${OVERLAY_DIR}/usr/local/bin/${tool}" || true
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}" || true
|
||||||
echo "vendor tool: ${tool} (included)"
|
echo "vendor tool: ${tool} (included)"
|
||||||
else
|
else
|
||||||
echo "vendor tool: ${tool} (not found, skipped)"
|
echo "vendor tool: ${tool} (not found, skipped)"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# --- build NVIDIA kernel modules and inject into overlay ---
|
# --- build NVIDIA kernel modules ---
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${ALPINE_VERSION}"
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||||
|
|
||||||
# Detect kernel version from installed headers (set by build-nvidia-module.sh above)
|
|
||||||
KVER=$(ls /usr/src/ 2>/dev/null | grep '^linux-headers-' | sed 's/linux-headers-//' | sort -V | tail -1)
|
|
||||||
[ -n "$KVER" ] || { echo "ERROR: linux-lts-dev not installed — no headers in /usr/src/"; exit 1; }
|
|
||||||
echo "=== kernel version: ${KVER} ==="
|
|
||||||
|
|
||||||
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/ (not /lib/modules/ — modloop squashfs
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
# mounts over that path at boot and makes it read-only, so overlay content there is inaccessible)
|
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
|
||||||
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
mkdir -p "${OVERLAY_KMOD_DIR}"
|
mkdir -p "${OVERLAY_KMOD_DIR}"
|
||||||
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
||||||
|
|
||||||
# Inject nvidia-smi and libnvidia-ml
|
# Inject nvidia-smi and libnvidia-ml
|
||||||
mkdir -p "${OVERLAY_DIR}/usr/local/bin" "${OVERLAY_DIR}/usr/lib"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
||||||
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_DIR}/usr/local/bin/"
|
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||||
chmod +x "${OVERLAY_DIR}/usr/local/bin/nvidia-smi"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
||||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_DIR}/usr/local/bin/" 2>/dev/null || true
|
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||||
chmod +x "${OVERLAY_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_DIR}/usr/lib/" 2>/dev/null || true
|
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||||
|
|
||||||
|
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||||
# --- embed build metadata ---
|
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||||
mkdir -p "${OVERLAY_DIR}/etc"
|
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
||||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
||||||
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
||||||
cat > "${OVERLAY_DIR}/etc/bee-release" <<EOF
|
|
||||||
BEE_ISO_VERSION=${AUDIT_VERSION}
|
|
||||||
BEE_AUDIT_VERSION=${AUDIT_VERSION}
|
|
||||||
BUILD_DATE=${BUILD_DATE}
|
|
||||||
GIT_COMMIT=${GIT_COMMIT}
|
|
||||||
ALPINE_VERSION=${ALPINE_VERSION}
|
|
||||||
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# --- export build info for genapkovl to inject into motd ---
|
|
||||||
BUILD_DATE=$(date +%Y-%m-%d)
|
|
||||||
GIT_COMMIT=$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo "unknown")
|
|
||||||
export BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} alpine:${ALPINE_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
|
|
||||||
|
|
||||||
# --- build ISO using mkimage ---
|
|
||||||
mkdir -p "${DIST_DIR}"
|
|
||||||
echo ""
|
|
||||||
echo "=== building ISO ==="
|
|
||||||
|
|
||||||
# Install our mkimage profile where mkimage.sh can find it.
|
|
||||||
# ~/.mkimage is the user plugin directory loaded by mkimage.sh.
|
|
||||||
# Clear ~/.mkimage to avoid stale profiles from previous builds being picked up
|
|
||||||
rm -rf "${HOME}/.mkimage"
|
|
||||||
mkdir -p "${HOME}/.mkimage"
|
|
||||||
cp "${BUILDER_DIR}/mkimg.bee.sh" "${HOME}/.mkimage/"
|
|
||||||
cp "${BUILDER_DIR}/genapkovl-bee.sh" "${HOME}/.mkimage/"
|
|
||||||
|
|
||||||
# Export overlay dir so the profile script can find it regardless of SRCDIR.
|
|
||||||
export BEE_OVERLAY_DIR="${OVERLAY_DIR}"
|
|
||||||
|
|
||||||
# Clean workdir: always nuke apks_* (stale packages from old mirror/version cause "unable to select" errors).
|
|
||||||
# Keep kernel_*, syslinux_*, grub_* — these are large but stable; they only change when KERNEL_PKG_VERSION changes.
|
|
||||||
if [ -d /var/tmp/bee-iso-work ]; then
|
|
||||||
find /var/tmp/bee-iso-work -maxdepth 1 -mindepth 1 \
|
|
||||||
-not -name 'kernel_*' \
|
|
||||||
-not -name 'syslinux_*' -not -name 'grub_*' \
|
|
||||||
-exec rm -rf {} + 2>/dev/null || true
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Run from /var/tmp: mkimage.sh calls git internally; running from inside /root/bee causes
|
# --- build / download NCCL ---
|
||||||
# "outside repository" errors. /var/tmp is outside the git repo and has enough scratch space.
|
|
||||||
# genapkovl-bee.sh is found by mkimage via ~/.mkimage/.
|
|
||||||
# Remove any stale genapkovl from /var/tmp — mkimage checks CWD first, stale files override ~/.mkimage/.
|
|
||||||
rm -f /var/tmp/genapkovl-*.sh
|
|
||||||
export TMPDIR=/var/tmp
|
|
||||||
cd /var/tmp
|
|
||||||
sh /usr/share/aports/scripts/mkimage.sh \
|
|
||||||
--tag "v${ALPINE_VERSION}" \
|
|
||||||
--outdir "${DIST_DIR}" \
|
|
||||||
--arch x86_64 \
|
|
||||||
--repository "https://dl-cdn.alpinelinux.org/alpine/v${ALPINE_VERSION}/main" \
|
|
||||||
--repository "https://dl-cdn.alpinelinux.org/alpine/v${ALPINE_VERSION}/community" \
|
|
||||||
--workdir /var/tmp/bee-iso-work \
|
|
||||||
--profile bee
|
|
||||||
|
|
||||||
ISO="${DIST_DIR}/alpine-bee-${ALPINE_VERSION}-x86_64.iso"
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== done ==="
|
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||||
echo "ISO: $ISO"
|
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||||
echo "Size: $(du -sh "$ISO" 2>/dev/null | cut -f1 || echo 'not found')"
|
|
||||||
|
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
|
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||||
|
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
|
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
|
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||||
|
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
|
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
|
# --- build nccl-tests ---
|
||||||
|
echo ""
|
||||||
|
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||||
|
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||||
|
"${NCCL_TESTS_VERSION}" \
|
||||||
|
"${NCCL_VERSION}" \
|
||||||
|
"${NCCL_CUDA_VERSION}" \
|
||||||
|
"${DIST_DIR}"
|
||||||
|
|
||||||
|
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||||
|
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
echo "=== all_reduce_perf injected ==="
|
||||||
|
|
||||||
|
# --- embed build metadata ---
|
||||||
|
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||||
|
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||||
|
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
||||||
|
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||||
|
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||||
|
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||||
|
BUILD_DATE=${BUILD_DATE}
|
||||||
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
|
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||||
|
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||||
|
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||||
|
NCCL_VERSION=${NCCL_VERSION}
|
||||||
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Patch motd with build info
|
||||||
|
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||||
|
if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
|
||||||
|
sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
|
||||||
|
> "${OVERLAY_STAGE_DIR}/etc/motd.patched"
|
||||||
|
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- sync overlay into live-build includes.chroot ---
|
||||||
|
LB_DIR="${BUILD_WORK_DIR}"
|
||||||
|
LB_INCLUDES="${LB_DIR}/config/includes.chroot"
|
||||||
|
mkdir -p "${LB_INCLUDES}"
|
||||||
|
rsync -a "${OVERLAY_STAGE_DIR}/" "${LB_INCLUDES}/"
|
||||||
|
|
||||||
|
# Ensure SSH authorized_keys perms are correct (rsync may alter)
|
||||||
|
if [ -f "${LB_INCLUDES}/root/.ssh/authorized_keys" ]; then
|
||||||
|
chmod 700 "${LB_INCLUDES}/root/.ssh"
|
||||||
|
chmod 600 "${LB_INCLUDES}/root/.ssh/authorized_keys"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- build ISO using live-build ---
|
||||||
|
echo ""
|
||||||
|
echo "=== building ISO (live-build) ==="
|
||||||
|
|
||||||
|
cd "${LB_DIR}"
|
||||||
|
lb clean 2>&1 | tail -3
|
||||||
|
lb config 2>&1 | tail -5
|
||||||
|
lb build 2>&1
|
||||||
|
|
||||||
|
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||||
|
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||||
|
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||||
|
if [ -f "$ISO_RAW" ]; then
|
||||||
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
|
echo ""
|
||||||
|
echo "=== done ==="
|
||||||
|
echo "ISO: $ISO_OUT"
|
||||||
|
if command -v stat >/dev/null 2>&1; then
|
||||||
|
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||||
|
else
|
||||||
|
ISO_SIZE_BYTES="$(wc -c < "$ISO_OUT" | tr -d ' ')"
|
||||||
|
fi
|
||||||
|
if command -v numfmt >/dev/null 2>&1; then
|
||||||
|
echo "Size: $(numfmt --to=iec --suffix=B "$ISO_SIZE_BYTES")"
|
||||||
|
else
|
||||||
|
echo "Size: ${ISO_SIZE_BYTES} bytes"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "ERROR: ISO not found at $ISO_RAW"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Boot via BMC virtual media and SSH to the server IP on port 22 as root."
|
echo "Boot via BMC virtual media and SSH to the server IP on port 22 as root."
|
||||||
|
|||||||
31
iso/builder/config/bootloaders/grub-pc/config.cfg
Normal file
31
iso/builder/config/bootloaders/grub-pc/config.cfg
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
set default=0
|
||||||
|
set timeout=5
|
||||||
|
|
||||||
|
if [ x$feature_default_font_path = xy ] ; then
|
||||||
|
font=unicode
|
||||||
|
else
|
||||||
|
font=$prefix/unicode.pf2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if loadfont $font ; then
|
||||||
|
set gfxmode=1920x1080,1280x1024,auto
|
||||||
|
set gfxpayload=keep
|
||||||
|
insmod efi_gop
|
||||||
|
insmod efi_uga
|
||||||
|
insmod video_bochs
|
||||||
|
insmod video_cirrus
|
||||||
|
else
|
||||||
|
set gfxmode=auto
|
||||||
|
insmod all_video
|
||||||
|
fi
|
||||||
|
|
||||||
|
insmod serial
|
||||||
|
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
||||||
|
|
||||||
|
insmod gfxterm
|
||||||
|
insmod png
|
||||||
|
|
||||||
|
source /boot/grub/theme.cfg
|
||||||
|
|
||||||
|
terminal_input console serial
|
||||||
|
terminal_output gfxterm serial
|
||||||
36
iso/builder/config/bootloaders/grub-pc/grub.cfg
Normal file
36
iso/builder/config/bootloaders/grub-pc/grub.cfg
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||||
|
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||||
|
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||||
|
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||||
|
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||||
|
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
menuentry "EASY-BEE" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE (load to RAM)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE (fail-safe)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "UEFI Firmware Settings" {
|
||||||
|
fwsetup
|
||||||
|
}
|
||||||
|
fi
|
||||||
51
iso/builder/config/bootloaders/grub-pc/live-theme/theme.txt
Normal file
51
iso/builder/config/bootloaders/grub-pc/live-theme/theme.txt
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
desktop-color: "#000000"
|
||||||
|
title-color: "#f5a800"
|
||||||
|
title-font: "Unifont Regular 16"
|
||||||
|
title-text: ""
|
||||||
|
message-font: "Unifont Regular 16"
|
||||||
|
terminal-font: "Unifont Regular 16"
|
||||||
|
|
||||||
|
#help bar at the bottom
|
||||||
|
+ label {
|
||||||
|
top = 100%-50
|
||||||
|
left = 0
|
||||||
|
width = 100%
|
||||||
|
height = 20
|
||||||
|
text = "@KEYMAP_SHORT@"
|
||||||
|
align = "center"
|
||||||
|
color = "#5a4800"
|
||||||
|
font = "Unifont Regular 16"
|
||||||
|
}
|
||||||
|
|
||||||
|
#boot menu
|
||||||
|
+ boot_menu {
|
||||||
|
left = 20%
|
||||||
|
width = 60%
|
||||||
|
top = 62%
|
||||||
|
height = 38%-80
|
||||||
|
item_color = "#c88000"
|
||||||
|
item_font = "Unifont Regular 16"
|
||||||
|
selected_item_color= "#f5a800"
|
||||||
|
selected_item_font = "Unifont Regular 16"
|
||||||
|
item_height = 16
|
||||||
|
item_padding = 0
|
||||||
|
item_spacing = 4
|
||||||
|
icon_width = 0
|
||||||
|
icon_heigh = 0
|
||||||
|
item_icon_space = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
#progress bar
|
||||||
|
+ progress_bar {
|
||||||
|
id = "__timeout__"
|
||||||
|
left = 20%
|
||||||
|
top = 100%-80
|
||||||
|
height = 14
|
||||||
|
width = 60%
|
||||||
|
font = "Unifont Regular 16"
|
||||||
|
text_color = "#0a0a00"
|
||||||
|
fg_color = "#f5a800"
|
||||||
|
bg_color = "#2a2200"
|
||||||
|
border_color = "#5a4800"
|
||||||
|
text = "@TIMEOUT_NOTIFICATION_LONG@"
|
||||||
|
}
|
||||||
9
iso/builder/config/bootloaders/grub-pc/theme.cfg
Normal file
9
iso/builder/config/bootloaders/grub-pc/theme.cfg
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
set color_normal=light-gray/black
|
||||||
|
set color_highlight=white/dark-gray
|
||||||
|
|
||||||
|
if [ -e /boot/grub/splash.png ]; then
|
||||||
|
set theme=/boot/grub/live-theme/theme.txt
|
||||||
|
else
|
||||||
|
set menu_color_normal=cyan/black
|
||||||
|
set menu_color_highlight=white/dark-gray
|
||||||
|
fi
|
||||||
24
iso/builder/config/bootloaders/isolinux/live.cfg.in
Normal file
24
iso/builder/config/bootloaders/isolinux/live.cfg.in
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
label live-@FLAVOUR@-normal
|
||||||
|
menu label ^EASY-BEE
|
||||||
|
menu default
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-toram
|
||||||
|
menu label EASY-BEE (^load to RAM)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-gsp-off
|
||||||
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-failsafe
|
||||||
|
menu label EASY-BEE (^fail-safe)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||||
56
iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
Executable file
56
iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
Executable file
@@ -0,0 +1,56 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9000-bee-setup.hook.chroot — runs inside Debian chroot during live-build
|
||||||
|
# Enables bee systemd services and configures the live environment.
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo "=== bee chroot setup ==="
|
||||||
|
|
||||||
|
ensure_bee_console_user() {
|
||||||
|
if id bee >/dev/null 2>&1; then
|
||||||
|
usermod -d /home/bee -s /bin/sh bee 2>/dev/null || true
|
||||||
|
else
|
||||||
|
useradd -d /home/bee -m -s /bin/sh -U bee
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p /home/bee
|
||||||
|
chown -R bee:bee /home/bee
|
||||||
|
echo "bee:eeb" | chpasswd
|
||||||
|
usermod -aG sudo,video,input bee 2>/dev/null || true
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_bee_console_user
|
||||||
|
|
||||||
|
# Enable bee services
|
||||||
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable bee-network.service
|
||||||
|
systemctl enable bee-nvidia.service
|
||||||
|
systemctl enable bee-preflight.service
|
||||||
|
systemctl enable bee-audit.service
|
||||||
|
systemctl enable bee-web.service
|
||||||
|
systemctl enable bee-sshsetup.service
|
||||||
|
systemctl enable ssh.service
|
||||||
|
systemctl enable lightdm.service 2>/dev/null || true
|
||||||
|
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||||
|
systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
|
||||||
|
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
||||||
|
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||||
|
|
||||||
|
# Ensure scripts are executable
|
||||||
|
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
|
|
||||||
|
# Reload udev rules
|
||||||
|
udevadm control --reload-rules 2>/dev/null || true
|
||||||
|
|
||||||
|
# Create export directory
|
||||||
|
mkdir -p /appdata/bee/export
|
||||||
|
|
||||||
|
if [ -f /etc/sudoers.d/bee ]; then
|
||||||
|
chmod 0440 /etc/sudoers.d/bee
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "=== bee chroot setup complete ==="
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user