Compare commits

...

10 Commits
v9.1 ... v9.8

Author SHA1 Message Date
Mikhail Chusavitin
8a21809ade Update chart submodule to v2.0 (hardware contract 2.10)
New in chart:
- event_logs and platform_config sections in viewer
- Storage columns: logical_block_size_bytes, physical_block_size_bytes,
  metadata_bytes_per_block
- Compact status/severity icons, severity filtering for event logs
- Fixed JS MIME type and base stylesheet

bee audit schema already has all required fields; no schema changes needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 15:52:30 +03:00
Mikhail Chusavitin
626763e31d Fix GRUB bitmap error: switch from PNG to TGA for splash logo
GRUB's PNG reader (grub2 bookworm) fails to load bee-logo.png despite the
file being valid RGB 8-bit non-interlaced PNG with minimal chunks. Root
cause is a known fragility in GRUB's png.c; exact trigger is unknown.

Switch to uncompressed 24-bit TGA which bypasses the PNG parser entirely.
tga.mod is already present in the ISO (x86_64-efi/tga.mod).

- Convert bee-logo.png → bee-logo.tga (480018 bytes, BGR top-left)
- config.cfg: insmod png → insmod tga
- theme.txt: bee-logo.png → bee-logo.tga
- Document all prior failed attempts in git-bible/grub-bitmap-error.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 15:46:13 +03:00
Mikhail Chusavitin
0b8a2ff83f Add validate test matrix and GPU test methodology docs
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 10:47:08 +03:00
Mikhail Chusavitin
2c22b01fe3 Fix IPMI hangs, add VROC license, fix blackbox service, drop qrencode
IPMI hang fix (Lenovo XCC SR650 V3):
- Add pluggable ipmi_profile system with per-vendor timeouts and fruEarlyExit flag
- Lenovo profile: 90s FRU timeout, streaming early-exit stops after PSU blocks found
- collectFRUEarlyExit streams ipmitool fru print and kills process once PSU blocks
  are followed by a non-PSU header (~6s instead of ~108s on 54-device FRU list)
- collectBMCFirmware and collectPSUs accept manufacturer and apply profile timeouts

VROC license detection:
- Detect VMD/VROC controller in PCIe list, run mdadm --detail-platform
- Parse "License:" line; store as snap.VROCLicense in HardwareSnapshot

Blackbox service fix:
- bee-blackbox.service was missing from systemctl enable list in ISO build hook
- Service never started on boot; state file never written; UI button stayed "Enable"

Drop qrencode:
- Remove from package list, standardTools API check, and runtime-flows doc

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 10:46:59 +03:00
Mikhail Chusavitin
ec89616585 Add storage block geometry to audit and viewer 2026-04-29 17:39:11 +03:00
Mikhail Chusavitin
c0dbbf96ad Add vendor RAID tools for livecd 2026-04-29 17:31:25 +03:00
Mikhail Chusavitin
76484b123c Fix fast-path: treat bootloader config changes as heavy
config/bootloaders was missing from the needs_full_build heavy-file
list, so changes to GRUB theme assets (e.g. bee-logo.png RGBA→RGB fix
in 333c44f) were silently skipped by the squashfs-surgery fast-path.
The old broken PNG stayed in boot/grub/live-theme/ inside the ISO.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 15:36:29 +03:00
Mikhail Chusavitin
8901596152 Add server diagnostic tools to ISO, drop btop
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 13:18:50 +03:00
Mikhail Chusavitin
7c504e5056 Collect IOMMU group per PCIe device from sysfs
Reads the iommu_group symlink for each BDF and exposes the group number
as iommu_group in the hardware snapshot JSON.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 12:34:54 +03:00
Mikhail Chusavitin
333c44f3ba Fix GRUB splash: convert bee-logo.png from RGBA to RGB
GRUB does not support RGBA PNG (color_type=6) — loading it returns a
null bitmap, triggering "null src bitmap in grub_video_bitmap_create_scaled".
Alpha channel composited onto black background (#000000 matches desktop-color).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 11:15:16 +03:00
36 changed files with 1094 additions and 149 deletions

View File

@@ -3,6 +3,7 @@ package collector
import (
"bee/audit/internal/schema"
"bufio"
"context"
"log/slog"
"os"
"os/exec"
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
return string(out), nil
}
var execIpmitool = func(args ...string) (string, error) {
out, err := exec.Command("ipmitool", args...).Output()
if err != nil {
return "", err
}
return string(out), nil
}
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
if _, err := exec.LookPath("ipmitool"); err != nil {
return nil
}
if _, err := os.Stat("/dev/ipmi0"); err != nil {
return nil
}
out, err := execIpmitool("mc", "info")
profile := selectIPMIProfile(manufacturer)
ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
raw, err := cmd.Output()
if err != nil {
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
return nil
}
version := parseBMCFirmwareRevision(out)
version := parseBMCFirmwareRevision(string(raw))
if version == "" {
return nil
}

View File

@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
board, biosFW := collectBoard()
snap.Board = board
snap.Firmware = append(snap.Firmware, biosFW...)
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
snap.CPUs = collectCPUs()
@@ -34,6 +34,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
}
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
bestEffortRescanHotplugStorage()
snap.Storage = collectStorage()
snap.PCIeDevices = collectPCIe()
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
@@ -44,7 +45,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
snap.PowerSupplies = collectPSUs()
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
snap.Sensors = buildSensorsFromDoc(sensorDoc)
finalizeSnapshot(&snap, collectedAt)

View File

@@ -0,0 +1,92 @@
package collector
// Package-level IPMI tuning profiles.
//
// Each profile is matched by board manufacturer (already known before PSU
// collection runs). The profile drives two things:
// - Per-command timeouts — prevents infinite hangs on slow BMCs.
// - FRU early-exit — streaming parser stops reading once all PSU entries
// are found, avoiding the tail of non-PSU FRU records.
//
// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
import (
"strings"
"time"
)
// ipmiProfile holds tuning parameters for one or more board manufacturers.
type ipmiProfile struct {
// name is shown in log messages.
name string
// manufacturers is a list of lowercase substrings matched against the
// board manufacturer string from dmidecode type 1.
manufacturers []string
// fruTimeout is the hard deadline for the entire `ipmitool fru print`
// command. Zero means no timeout (not recommended).
fruTimeout time.Duration
// sdrTimeout is the hard deadline for `ipmitool sdr`.
sdrTimeout time.Duration
// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
mcInfoTimeout time.Duration
// fruEarlyExit instructs the streaming FRU parser to stop reading
// after it has found at least one PSU entry and the current block is
// complete. Useful on servers with many non-PSU FRU devices.
fruEarlyExit bool
}
// ipmiProfiles is the ordered list of profiles. First match wins.
var ipmiProfiles = []ipmiProfile{
{
// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
// full `fru print` scan takes ~108 s on a loaded BMC. Enable early
// exit so collection stops once PSU records are found.
name: "lenovo",
manufacturers: []string{"lenovo"},
fruTimeout: 90 * time.Second,
sdrTimeout: 45 * time.Second,
mcInfoTimeout: 15 * time.Second,
fruEarlyExit: true,
},
{
// HPE iLO-based servers (ProLiant DL/ML/BL).
name: "hpe",
manufacturers: []string{"hp", "hewlett packard"},
fruTimeout: 60 * time.Second,
sdrTimeout: 30 * time.Second,
mcInfoTimeout: 10 * time.Second,
fruEarlyExit: false,
},
{
// Dell iDRAC-based servers.
name: "dell",
manufacturers: []string{"dell"},
fruTimeout: 60 * time.Second,
sdrTimeout: 30 * time.Second,
mcInfoTimeout: 10 * time.Second,
fruEarlyExit: false,
},
}
// defaultIPMIProfile is used when no vendor profile matches.
var defaultIPMIProfile = ipmiProfile{
name: "default",
fruTimeout: 60 * time.Second,
sdrTimeout: 30 * time.Second,
mcInfoTimeout: 10 * time.Second,
fruEarlyExit: false,
}
// selectIPMIProfile returns the profile for the given board manufacturer.
func selectIPMIProfile(manufacturer string) ipmiProfile {
mfgLower := strings.ToLower(strings.TrimSpace(manufacturer))
for _, p := range ipmiProfiles {
for _, m := range p.manufacturers {
if strings.Contains(mfgLower, m) {
return p
}
}
}
return defaultIPMIProfile
}

View File

@@ -4,7 +4,9 @@ import (
"bee/audit/internal/schema"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
)
@@ -140,6 +142,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
dev.NUMANode = &numaNode
}
if group, ok := readPCIIOMMUGroup(bdf); ok {
dev.IOMMUGroup = &group
}
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
dev.LinkWidth = &width
}
@@ -179,6 +184,21 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
return dev
}
// readPCIIOMMUGroup resolves the IOMMU group number for a BDF via the
// iommu_group symlink in sysfs: .../devices/<bdf>/iommu_group -> .../kernel/iommu_groups/<N>
func readPCIIOMMUGroup(bdf string) (int, bool) {
link := "/sys/bus/pci/devices/" + bdf + "/iommu_group"
target, err := os.Readlink(link)
if err != nil {
return 0, false
}
n, err := strconv.Atoi(filepath.Base(target))
if err != nil {
return 0, false
}
return n, true
}
// readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
func readPCIIDs(bdf string) (vendorID, deviceID int) {
base := "/sys/bus/pci/devices/" + bdf

View File

@@ -2,6 +2,8 @@ package collector
import (
"bee/audit/internal/schema"
"bufio"
"context"
"log/slog"
"os/exec"
"regexp"
@@ -10,16 +12,29 @@ import (
"strings"
)
func collectPSUs() []schema.HardwarePowerSupply {
func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
profile := selectIPMIProfile(manufacturer)
var psus []schema.HardwarePowerSupply
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
psus = parseFRU(string(out))
fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
defer fruCancel()
if profile.fruEarlyExit {
psus = collectFRUEarlyExit(fruCtx)
} else {
slog.Info("psu: fru unavailable", "err", err)
cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
if out, err := cmd.Output(); err == nil {
psus = parseFRU(string(out))
} else {
slog.Info("psu: fru unavailable", "err", err)
}
}
sdrData := map[int]psuSDR{}
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
defer sdrCancel()
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
if sdrOut, err := cmd.Output(); err == nil {
sdrData = parsePSUSDR(string(sdrOut))
if len(psus) == 0 {
psus = synthesizePSUsFromSDR(sdrData)
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
return nil
}
slog.Info("psu: collected", "count", len(psus))
slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
return psus
}
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
// as soon as it has found all PSU blocks and the next block is not a PSU.
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
pipe, err := cmd.StdoutPipe()
if err != nil {
slog.Info("psu: fru pipe unavailable", "err", err)
return nil
}
if err := cmd.Start(); err != nil {
slog.Info("psu: fru start failed", "err", err)
return nil
}
var psus []schema.HardwarePowerSupply
var currentBlock strings.Builder
slot := 0
psuFound := false
stoppedEarly := false
scanner := bufio.NewScanner(pipe)
for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, "FRU Device Description") {
if currentBlock.Len() > 0 {
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
psus = append(psus, psu)
psuFound = true
slot++
}
currentBlock.Reset()
}
// Stop once we've collected PSUs and hit a non-PSU block header.
if psuFound && !isPSUHeader(strings.ToLower(line)) {
stoppedEarly = true
break
}
}
currentBlock.WriteString(line)
currentBlock.WriteByte('\n')
}
if !stoppedEarly && currentBlock.Len() > 0 {
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
psus = append(psus, psu)
}
}
// Kill the process immediately on early exit rather than waiting for context timeout.
if cmd.Process != nil {
cmd.Process.Kill() //nolint:errcheck
}
cmd.Wait() //nolint:errcheck
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
return psus
}

View File

@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
return arrays
}
// collectVROCLicense runs mdadm --detail-platform and extracts the License field.
// Returns nil when VROC is absent or the platform does not report a license.
func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
if !hasVROCController(pcie) {
return nil
}
out, err := raidToolQuery("mdadm", "--detail-platform")
if err != nil {
slog.Info("vroc: mdadm --detail-platform unavailable", "err", err)
return nil
}
return parseMDAdmPlatformLicense(string(out))
}
func parseMDAdmPlatformLicense(raw string) *string {
for _, line := range strings.Split(raw, "\n") {
trimmed := strings.TrimSpace(line)
if !strings.HasPrefix(strings.ToLower(trimmed), "license") {
continue
}
if idx := strings.Index(trimmed, ":"); idx >= 0 {
val := strings.TrimSpace(trimmed[idx+1:])
if val != "" {
v := strings.ToLower(val)
return &v
}
}
}
return nil
}
func queryDeviceSerial(devPath string) string {
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
var ctrl nvmeIDCtrl

View File

@@ -4,12 +4,52 @@ import (
"bee/audit/internal/schema"
"encoding/json"
"log/slog"
"os"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
)
var (
pciRescanPath = "/sys/bus/pci/rescan"
scsiHostScanGlob = "/sys/class/scsi_host/host*/scan"
hotplugWriteFile = os.WriteFile
hotplugExecCommand = exec.Command
hotplugGlob = filepath.Glob
nvmeLBAFCompactRE = regexp.MustCompile(`(?im)^\s*lbaf\s+\d+\s*:\s*ms:(\d+)\s+lbads:(\d+).*?\(in use\)\s*$`)
nvmeLBAFVerboseRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+\d+\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*?\(in use\)\s*$`)
sgReadcapBlockRE = regexp.MustCompile(`(?im)logical block length\s*=\s*(\d+)\s+bytes`)
sgReadcapProtRE = regexp.MustCompile(`(?im)prot_en\s*=\s*1`)
)
func bestEffortRescanHotplugStorage() {
if err := hotplugWriteFile(pciRescanPath, []byte("1\n"), 0644); err != nil {
slog.Info("storage: pci rescan skipped", "path", pciRescanPath, "err", err)
} else {
slog.Info("storage: triggered pci rescan for hotplug discovery")
}
hostPaths, err := hotplugGlob(scsiHostScanGlob)
if err != nil {
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
} else {
for _, path := range hostPaths {
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
continue
}
slog.Info("storage: triggered scsi host scan", "path", path)
}
}
out, err := hotplugExecCommand("udevadm", "settle", "--timeout=10").CombinedOutput()
if err != nil {
slog.Info("storage: udev settle after hotplug rescan failed", "err", err, "output", strings.TrimSpace(string(out)))
}
}
func collectStorage() []schema.HardwareStorage {
devs := discoverStorageDevices()
result := make([]schema.HardwareStorage, 0, len(devs))
@@ -35,6 +75,8 @@ type lsblkDevice struct {
Model string `json:"model"`
Tran string `json:"tran"`
Hctl string `json:"hctl"`
LogSec string `json:"log-sec"`
PhySec string `json:"phy-sec"`
}
type lsblkRoot struct {
@@ -101,7 +143,7 @@ func isVirtualHDiskModel(model string) bool {
func lsblkDevices() []lsblkDevice {
out, err := exec.Command("lsblk", "-J", "-d",
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL,LOG-SEC,PHY-SEC").Output()
if err != nil {
slog.Warn("storage: lsblk failed", "err", err)
return nil
@@ -208,6 +250,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
present := true
s := schema.HardwareStorage{Present: &present}
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
applyStorageBlockGeometry(&s, dev)
tran := strings.ToLower(dev.Tran)
devPath := "/dev/" + dev.Name
@@ -327,6 +370,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
lifeRemainingPct: lifeRemaining,
}
applySCSISmartctlTelemetry(&s, raw, &status)
applySCSIProtectionBlockGeometry(&s, devPath)
setStorageHealthStatus(&s, status)
return s
}
@@ -374,6 +418,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
Interface: &iface,
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
}
applyStorageBlockGeometry(&s, dev)
devPath := "/dev/" + dev.Name
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
@@ -408,6 +453,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
}
}
}
applyNVMeBlockGeometry(&s, devPath)
// smart-log: wear telemetry
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
@@ -540,6 +586,19 @@ func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, s
"path:user_capacity.block_size",
)
if hasBlockSize && blockSize > 0 {
if s.LogicalBlockSizeBytes == nil {
s.LogicalBlockSizeBytes = &blockSize
}
if s.MetadataBytesPerBlock == nil {
zero := int64(0)
s.MetadataBytesPerBlock = &zero
}
if s.Telemetry == nil {
s.Telemetry = map[string]any{}
}
s.Telemetry["logical_block_size_bytes"] = *s.LogicalBlockSizeBytes
s.Telemetry["metadata_bytes_per_block"] = *s.MetadataBytesPerBlock
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
if v, ok := firstInt64(raw,
"path:logical_blocks_written",
"path:total_lbas_written",
@@ -557,6 +616,117 @@ func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, s
}
}
func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
if s == nil {
return
}
logical := parseStorageBytes(dev.LogSec)
physical := parseStorageBytes(dev.PhySec)
if logical <= 0 && physical <= 0 {
return
}
if s.Telemetry == nil {
s.Telemetry = map[string]any{}
}
if logical > 0 {
s.LogicalBlockSizeBytes = &logical
s.Telemetry["logical_block_size_bytes"] = logical
if s.MetadataBytesPerBlock == nil {
zero := int64(0)
s.MetadataBytesPerBlock = &zero
s.Telemetry["metadata_bytes_per_block"] = zero
}
}
if physical > 0 {
s.PhysicalBlockSizeBytes = &physical
s.Telemetry["physical_block_size_bytes"] = physical
}
if s.LogicalBlockSizeBytes != nil && s.MetadataBytesPerBlock != nil {
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
}
}
func applyNVMeBlockGeometry(s *schema.HardwareStorage, devPath string) {
if s == nil || strings.TrimSpace(devPath) == "" {
return
}
out, err := exec.Command("nvme", "id-ns", devPath, "-H").CombinedOutput()
if err != nil {
return
}
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(string(out))
if !ok {
return
}
setStorageBlockGeometry(s, dataBytes, metadataBytes)
}
func applySCSIProtectionBlockGeometry(s *schema.HardwareStorage, devPath string) {
if s == nil || strings.TrimSpace(devPath) == "" {
return
}
out, err := exec.Command("sg_readcap", "-l", devPath).CombinedOutput()
if err != nil {
return
}
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(string(out))
if !ok {
return
}
setStorageBlockGeometry(s, dataBytes, metadataBytes)
}
func setStorageBlockGeometry(s *schema.HardwareStorage, dataBytes, metadataBytes int64) {
if s == nil || dataBytes <= 0 || metadataBytes < 0 {
return
}
if s.Telemetry == nil {
s.Telemetry = map[string]any{}
}
s.LogicalBlockSizeBytes = &dataBytes
s.MetadataBytesPerBlock = &metadataBytes
s.Telemetry["logical_block_size_bytes"] = dataBytes
s.Telemetry["metadata_bytes_per_block"] = metadataBytes
s.Telemetry["block_format"] = formatBlockFormat(dataBytes, metadataBytes)
}
func formatBlockFormat(dataBytes, metadataBytes int64) string {
return strconv.FormatInt(dataBytes, 10) + "+" + strconv.FormatInt(metadataBytes, 10)
}
func parseNVMeBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
if m := nvmeLBAFCompactRE.FindStringSubmatch(raw); len(m) == 3 {
ms, errMS := strconv.ParseInt(m[1], 10, 64)
lbads, errLBADS := strconv.ParseInt(m[2], 10, 64)
if errMS == nil && errLBADS == nil && lbads >= 0 && lbads < 63 {
return 1 << lbads, ms, true
}
}
if m := nvmeLBAFVerboseRE.FindStringSubmatch(raw); len(m) == 3 {
ms, errMS := strconv.ParseInt(m[1], 10, 64)
ds, errDS := strconv.ParseInt(m[2], 10, 64)
if errMS == nil && errDS == nil && ds > 0 {
return ds, ms, true
}
}
return 0, 0, false
}
func parseSCSIBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
m := sgReadcapBlockRE.FindStringSubmatch(raw)
if len(m) != 2 {
return 0, 0, false
}
blockBytes, err := strconv.ParseInt(m[1], 10, 64)
if err != nil || blockBytes <= 0 {
return 0, 0, false
}
if sgReadcapProtRE.MatchString(raw) {
return blockBytes, 8, true
}
return blockBytes, 0, true
}
func firstInt64(root map[string]any, candidates ...string) (int64, bool) {
for _, candidate := range candidates {
if !strings.HasPrefix(candidate, "path:") {

View File

@@ -0,0 +1,69 @@
package collector
import "testing"
func TestParseNVMeBlockFormatCompact(t *testing.T) {
t.Parallel()
raw := `
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
lbaf 1 : ms:8 lbads:9 rp:0x1
`
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
if !ok {
t.Fatal("parseNVMeBlockFormat returned ok=false")
}
if dataBytes != 512 || metadataBytes != 0 {
t.Fatalf("got %d+%d want 512+0", dataBytes, metadataBytes)
}
}
func TestParseNVMeBlockFormatVerbose(t *testing.T) {
t.Parallel()
raw := `
LBA Format 0 : Metadata Size: 8 bytes - Data Size: 512 bytes - Relative Performance: 0 Better (in use)
LBA Format 1 : Metadata Size: 0 bytes - Data Size: 4096 bytes - Relative Performance: 1 Best
`
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
if !ok {
t.Fatal("parseNVMeBlockFormat returned ok=false")
}
if dataBytes != 512 || metadataBytes != 8 {
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
}
}
func TestParseSCSIBlockFormatWithProtection(t *testing.T) {
t.Parallel()
raw := `
Read Capacity results:
Protection: prot_en=1, p_type=1, p_i_exponent=0
Logical block length=512 bytes
`
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
if !ok {
t.Fatal("parseSCSIBlockFormat returned ok=false")
}
if dataBytes != 512 || metadataBytes != 8 {
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
}
}
func TestParseSCSIBlockFormatWithoutProtection(t *testing.T) {
t.Parallel()
raw := `
Read Capacity results:
Protection: prot_en=0, p_type=0, p_i_exponent=0
Logical block length=4096 bytes
`
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
if !ok {
t.Fatal("parseSCSIBlockFormat returned ok=false")
}
if dataBytes != 4096 || metadataBytes != 0 {
t.Fatalf("got %d+%d want 4096+0", dataBytes, metadataBytes)
}
}

View File

@@ -1,6 +1,12 @@
package collector
import "testing"
import (
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
)
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
t.Parallel()
@@ -31,3 +37,82 @@ func TestParseStorageBytes(t *testing.T) {
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
}
}
func TestBestEffortRescanHotplugStorage(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
rescanPath := filepath.Join(tmp, "pci-rescan")
scanDir := filepath.Join(tmp, "scsi_host")
host0Path := filepath.Join(scanDir, "host0", "scan")
host1Path := filepath.Join(scanDir, "host1", "scan")
argsPath := filepath.Join(tmp, "udevadm-args")
toolPath := filepath.Join(tmp, "udevadm")
if err := os.MkdirAll(filepath.Dir(host0Path), 0755); err != nil {
t.Fatalf("mkdir host0: %v", err)
}
if err := os.MkdirAll(filepath.Dir(host1Path), 0755); err != nil {
t.Fatalf("mkdir host1: %v", err)
}
if err := os.WriteFile(host0Path, nil, 0644); err != nil {
t.Fatalf("touch host0 scan: %v", err)
}
if err := os.WriteFile(host1Path, nil, 0644); err != nil {
t.Fatalf("touch host1 scan: %v", err)
}
script := "#!/bin/sh\nprintf '%s' \"$*\" > \"" + argsPath + "\"\n"
if err := os.WriteFile(toolPath, []byte(script), 0755); err != nil {
t.Fatalf("write udevadm stub: %v", err)
}
oldPath := os.Getenv("PATH")
if err := os.Setenv("PATH", tmp+string(os.PathListSeparator)+oldPath); err != nil {
t.Fatalf("set PATH: %v", err)
}
defer func() { _ = os.Setenv("PATH", oldPath) }()
oldRescanPath := pciRescanPath
oldSCSIGlob := scsiHostScanGlob
oldWriteFile := hotplugWriteFile
oldExecCommand := hotplugExecCommand
oldGlob := hotplugGlob
pciRescanPath = rescanPath
scsiHostScanGlob = filepath.Join(scanDir, "host*", "scan")
hotplugWriteFile = os.WriteFile
hotplugExecCommand = exec.Command
hotplugGlob = filepath.Glob
defer func() {
pciRescanPath = oldRescanPath
scsiHostScanGlob = oldSCSIGlob
hotplugWriteFile = oldWriteFile
hotplugExecCommand = oldExecCommand
hotplugGlob = oldGlob
}()
bestEffortRescanHotplugStorage()
raw, err := os.ReadFile(rescanPath)
if err != nil {
t.Fatalf("read rescan file: %v", err)
}
if string(raw) != "1\n" {
t.Fatalf("rescan payload=%q want %q", string(raw), "1\n")
}
for _, path := range []string{host0Path, host1Path} {
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read scsi scan file %s: %v", path, err)
}
if string(raw) != "- - -\n" {
t.Fatalf("scsi scan payload at %s =%q want %q", path, string(raw), "- - -\n")
}
}
args, err := os.ReadFile(argsPath)
if err != nil {
t.Fatalf("read udevadm args: %v", err)
}
if got := strings.TrimSpace(string(args)); got != "settle --timeout=10" {
t.Fatalf("udevadm args=%q want %q", got, "settle --timeout=10")
}
}

View File

@@ -40,6 +40,12 @@ func TestApplySCSISmartctlTelemetry(t *testing.T) {
if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 {
t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes)
}
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 4096 {
t.Fatalf("logical_block_size_bytes=%v want 4096", disk.LogicalBlockSizeBytes)
}
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
}
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 {
t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct)
}
@@ -80,6 +86,12 @@ func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T)
if *disk.WrittenBytes != 20 {
t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
}
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 512 {
t.Fatalf("logical_block_size_bytes=%v want 512", disk.LogicalBlockSizeBytes)
}
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
}
if *disk.LifeRemainingPct != 30 {
t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
}

View File

@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
}
}
func TestParseMDAdmPlatformLicense(t *testing.T) {
premium := `Platform : Intel(R) Virtual RAID on CPU
Version : 1.3.0.1138
RAID Levels : raid0 raid1 raid5 raid10
Total Disks : 4
License : Premium
`
got := parseMDAdmPlatformLicense(premium)
if got == nil || *got != "premium" {
t.Fatalf("expected 'premium', got %v", got)
}
standard := `Platform : Intel(R) Virtual RAID on CPU
License : Standard
`
got = parseMDAdmPlatformLicense(standard)
if got == nil || *got != "standard" {
t.Fatalf("expected 'standard', got %v", got)
}
noLicense := `Platform : Intel(R) Virtual RAID on CPU
Version : 1.0.0
`
got = parseMDAdmPlatformLicense(noLicense)
if got != nil {
t.Fatalf("expected nil, got %v", *got)
}
}
func TestHasVROCController(t *testing.T) {
intel := vendorIntel
model := "Volume Management Device NVMe RAID Controller"

View File

@@ -66,6 +66,7 @@ type HardwareSnapshot struct {
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
Sensors *HardwareSensors `json:"sensors,omitempty"`
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
VROCLicense *string `json:"vroc_license,omitempty"`
}
type HardwareHealthSummary struct {
@@ -143,30 +144,33 @@ type HardwareMemory struct {
type HardwareStorage struct {
HardwareComponentStatus
Slot *string `json:"slot,omitempty"`
Type *string `json:"type,omitempty"`
Model *string `json:"model,omitempty"`
SizeGB *int `json:"size_gb,omitempty"`
SerialNumber *string `json:"serial_number,omitempty"`
Manufacturer *string `json:"manufacturer,omitempty"`
Firmware *string `json:"firmware,omitempty"`
Interface *string `json:"interface,omitempty"`
Present *bool `json:"present,omitempty"`
TemperatureC *float64 `json:"temperature_c,omitempty"`
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
PowerCycles *int64 `json:"power_cycles,omitempty"`
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
MediaErrors *int64 `json:"media_errors,omitempty"`
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
WrittenBytes *int64 `json:"written_bytes,omitempty"`
ReadBytes *int64 `json:"read_bytes,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
Telemetry map[string]any `json:"-"`
Slot *string `json:"slot,omitempty"`
Type *string `json:"type,omitempty"`
Model *string `json:"model,omitempty"`
SizeGB *int `json:"size_gb,omitempty"`
LogicalBlockSizeBytes *int64 `json:"logical_block_size_bytes,omitempty"`
PhysicalBlockSizeBytes *int64 `json:"physical_block_size_bytes,omitempty"`
MetadataBytesPerBlock *int64 `json:"metadata_bytes_per_block,omitempty"`
SerialNumber *string `json:"serial_number,omitempty"`
Manufacturer *string `json:"manufacturer,omitempty"`
Firmware *string `json:"firmware,omitempty"`
Interface *string `json:"interface,omitempty"`
Present *bool `json:"present,omitempty"`
TemperatureC *float64 `json:"temperature_c,omitempty"`
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
PowerCycles *int64 `json:"power_cycles,omitempty"`
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
MediaErrors *int64 `json:"media_errors,omitempty"`
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
WrittenBytes *int64 `json:"written_bytes,omitempty"`
ReadBytes *int64 `json:"read_bytes,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
Telemetry map[string]any `json:"-"`
}
type HardwarePCIeDevice struct {
@@ -211,6 +215,7 @@ type HardwarePCIeDevice struct {
Firmware *string `json:"firmware,omitempty"`
MacAddresses []string `json:"mac_addresses,omitempty"`
Present *bool `json:"present,omitempty"`
IOMMUGroup *int `json:"iommu_group,omitempty"`
Telemetry map[string]any `json:"-"`
}

View File

@@ -50,6 +50,9 @@ func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
writtenBytes := int64(9876543210)
readBytes := int64(1234567890)
lifeRemainingPct := 91.0
logicalBlockSizeBytes := int64(512)
physicalBlockSizeBytes := int64(4096)
metadataBytesPerBlock := int64(8)
payload := HardwareIngestRequest{
CollectedAt: "2026-03-15T15:00:00Z",
@@ -57,12 +60,15 @@ func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
Board: HardwareBoard{SerialNumber: "SRV-001"},
Storage: []HardwareStorage{
{
SerialNumber: stringPtr("DISK-001"),
Model: stringPtr("TestDisk"),
PowerOnHours: &powerOnHours,
WrittenBytes: &writtenBytes,
ReadBytes: &readBytes,
LifeRemainingPct: &lifeRemainingPct,
SerialNumber: stringPtr("DISK-001"),
Model: stringPtr("TestDisk"),
LogicalBlockSizeBytes: &logicalBlockSizeBytes,
PhysicalBlockSizeBytes: &physicalBlockSizeBytes,
MetadataBytesPerBlock: &metadataBytesPerBlock,
PowerOnHours: &powerOnHours,
WrittenBytes: &writtenBytes,
ReadBytes: &readBytes,
LifeRemainingPct: &lifeRemainingPct,
},
},
},
@@ -75,6 +81,9 @@ func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
text := string(data)
for _, needle := range []string{
`"storage":[{`,
`"logical_block_size_bytes":512`,
`"physical_block_size_bytes":4096`,
`"metadata_bytes_per_block":8`,
`"power_on_hours":12450`,
`"written_bytes":9876543210`,
`"read_bytes":1234567890`,

View File

@@ -1295,7 +1295,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
var standardTools = []string{
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
"mstflint", "qrencode",
"mstflint",
}
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {

View File

@@ -572,6 +572,7 @@ func (h *handler) handleExportIndex(w http.ResponseWriter, r *http.Request) {
func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
snapshot, _ := loadSnapshot(h.opts.AuditPath)
snapshot = enrichSnapshotForViewer(snapshot)
body, err := viewer.RenderHTML(snapshot, h.opts.Title)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)

View File

@@ -1016,6 +1016,39 @@ func TestViewerRendersLatestSnapshot(t *testing.T) {
}
}
func TestViewerRendersDerivedStorageBlockFormat(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")
body := `{
"collected_at":"2026-04-29T00:05:00Z",
"hardware":{
"board":{"serial_number":"SERIAL-NEW"},
"storage":[
{
"serial_number":"DISK-1",
"model":"Test NVMe",
"logical_block_size_bytes":512,
"physical_block_size_bytes":4096,
"metadata_bytes_per_block":8
}
]
}
}`
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
t.Fatal(err)
}
handler := NewHandler(HandlerOptions{AuditPath: path})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/viewer", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
if !strings.Contains(rec.Body.String(), "512&#43;8") {
t.Fatalf("viewer body missing derived block format: %s", rec.Body.String())
}
}
func TestAuditJSONServesLatestSnapshot(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")
@@ -1038,6 +1071,36 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
}
}
func TestAuditJSONDoesNotInjectDerivedStorageBlockFormat(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")
body := `{
"hardware":{
"board":{"serial_number":"SERIAL-API"},
"storage":[
{
"serial_number":"DISK-1",
"logical_block_size_bytes":512,
"metadata_bytes_per_block":8
}
]
}
}`
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
t.Fatal(err)
}
handler := NewHandler(HandlerOptions{AuditPath: path})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit.json", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
if strings.Contains(rec.Body.String(), "block_format") {
t.Fatalf("audit.json should remain contract-only: %s", rec.Body.String())
}
}
func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"})
rec := httptest.NewRecorder()

View File

@@ -0,0 +1,62 @@
package webui
import (
"encoding/json"
"strconv"
)
func enrichSnapshotForViewer(snapshot []byte) []byte {
if len(snapshot) == 0 {
return snapshot
}
var root map[string]any
if err := json.Unmarshal(snapshot, &root); err != nil {
return snapshot
}
hardware, _ := root["hardware"].(map[string]any)
if len(hardware) == 0 {
return snapshot
}
storage, _ := hardware["storage"].([]any)
if len(storage) == 0 {
return snapshot
}
changed := false
for _, item := range storage {
row, _ := item.(map[string]any)
if len(row) == 0 {
continue
}
if _, exists := row["block_format"]; exists {
continue
}
logical, okLogical := jsonNumberToInt64(row["logical_block_size_bytes"])
metadata, okMetadata := jsonNumberToInt64(row["metadata_bytes_per_block"])
if !okLogical || !okMetadata || logical <= 0 || metadata < 0 {
continue
}
row["block_format"] = strconv.FormatInt(logical, 10) + "+" + strconv.FormatInt(metadata, 10)
changed = true
}
if !changed {
return snapshot
}
out, err := json.Marshal(root)
if err != nil {
return snapshot
}
return out
}
func jsonNumberToInt64(v any) (int64, bool) {
switch x := v.(type) {
case float64:
return int64(x), true
case int64:
return x, true
case int:
return int64(x), true
default:
return 0, false
}
}

View File

@@ -9,5 +9,62 @@ Generic engineering rules live in `bible/rules/patterns/`.
|---|---|
| `architecture/system-overview.md` | What bee does, scope, tech stack |
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list |
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
| `decisions/` | Architectural decision log, including read-only submodule policy |
## Validate Test Matrix
### Validate
- CPU check
- `lscpu`
- `sensors`
- `stress-ng`
- Memory check
- `free`
- `timeout <timeout_sec> memtester`
- `free`
- NVMe storage check
- `nvme id-ctrl`
- `nvme smart-log`
- `nvme device-self-test`
- SATA/SAS storage check
- `smartctl -H -A`
- `smartctl -t short`
- Basic NVIDIA GPU check
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dmidecode -t baseboard`
- `dmidecode -t system`
- `dcgmi diag -r 2`
- Inter-GPU communication check
- `all_reduce_perf`
- GPU bandwidth check
- `dcgmi diag -r nvbandwidth`
### Validate -> Stress
- Extended NVIDIA GPU check
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dmidecode -t baseboard`
- `dmidecode -t system`
- `dcgmi diag -r 3`
- NVIDIA targeted stress
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r targeted_stress`
- NVIDIA targeted power
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r targeted_power`
- NVIDIA pulse test
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r pulse_test`
- Inter-GPU communication check
- `all_reduce_perf`
- GPU bandwidth check
- `dcgmi diag -r nvbandwidth`

View File

@@ -149,7 +149,6 @@ Current validation state:
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
8. output JSON → /var/log/bee-audit.json
9. QR summary to stdout (qrencode if available)
```
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.

View File

@@ -0,0 +1,54 @@
# GPU PCIe Test Methodology
## Validate
- CPU check
- `lscpu`
- `sensors`
- `stress-ng`
- Memory check
- `free`
- `timeout <timeout_sec> memtester`
- `free`
- NVMe storage check
- `nvme id-ctrl`
- `nvme smart-log`
- `nvme device-self-test`
- SATA/SAS storage check
- `smartctl -H -A`
- `smartctl -t short`
- Basic NVIDIA GPU check
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dmidecode -t baseboard`
- `dmidecode -t system`
- `dcgmi diag -r 2`
- Inter-GPU communication check
- `all_reduce_perf`
- GPU bandwidth check
- `dcgmi diag -r nvbandwidth`
## Validate -> Stress
- Extended NVIDIA GPU check
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dmidecode -t baseboard`
- `dmidecode -t system`
- `dcgmi diag -r 3`
- NVIDIA targeted stress
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r targeted_stress`
- NVIDIA targeted power
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r targeted_power`
- NVIDIA pulse test
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r pulse_test`
- Inter-GPU communication check
- `all_reduce_perf`
- GPU bandwidth check
- `dcgmi diag -r nvbandwidth`

View File

@@ -1,7 +1,7 @@
---
title: Hardware Ingest JSON Contract
version: "2.7"
updated: "2026-03-15"
version: "2.10"
updated: "2026-04-29"
maintainer: Reanimator Core
audience: external-integrators, ai-agents
language: ru
@@ -9,7 +9,7 @@ language: ru
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
Версия: **2.7** · Дата: **2026-03-15**
Версия: **2.10** · Дата: **2026-04-29**
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
@@ -22,6 +22,9 @@ language: ru
| Версия | Дата | Изменения |
|--------|------|-----------|
| 2.10 | 2026-04-29 | Для `hardware.storage[]` добавлены необязательные числовые поля `logical_block_size_bytes`, `physical_block_size_bytes`, `metadata_bytes_per_block` для нормализованного описания формата блока накопителя |
| 2.9 | 2026-03-19 | Добавлена необязательная секция `hardware.platform_config` — произвольный объект с настройками платформы (BIOS/Redfish); хранится как latest-snapshot per machine |
| 2.8 | 2026-03-15 | Поле `location` удалено из всех `sensors.*`; сенсоры передаются только по `name` и измеренным значениям |
| 2.7 | 2026-03-15 | Явно запрещён синтез данных в `event_logs`; интеграторы не должны придумывать серийные номера компонентов, если источник их не отдал |
| 2.6 | 2026-03-15 | Добавлена необязательная секция `event_logs` для dedup/upsert логов `host` / `bmc` / `redfish` вне history timeline |
| 2.5 | 2026-03-15 | Добавлено общее необязательное поле `manufactured_year_week` для компонентных секций (`YYYY-Www`) |
@@ -131,8 +134,9 @@ GET /ingest/hardware/jobs/{job_id}
"storage": [ ... ],
"pcie_devices": [ ... ],
"power_supplies": [ ... ],
"sensors": { ... },
"event_logs": [ ... ]
"sensors": { ... },
"event_logs": [ ... ],
"platform_config": { ... }
}
}
```
@@ -343,6 +347,9 @@ GET /ingest/hardware/jobs/{job_id}
| `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` |
| `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` |
| `size_gb` | int | нет | Размер в ГБ |
| `logical_block_size_bytes` | int64 | нет | Логический размер пользовательского блока данных, например `512` или `4096` |
| `physical_block_size_bytes` | int64 | нет | Физический размер блока, если известен, например `4096` |
| `metadata_bytes_per_block` | int64 | нет | Metadata / protection bytes на логический блок, например `0` или `8` |
| `temperature_c` | float | нет | Температура накопителя, °C (telemetry) |
| `power_on_hours` | int64 | нет | Время работы, часы |
| `power_cycles` | int64 | нет | Количество циклов питания |
@@ -363,6 +370,11 @@ GET /ingest/hardware/jobs/{job_id}
Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`.
Формат вида `512+8` в контракт не добавляется отдельным строковым полем. Если источник знает такую форму, он должен передавать её как:
- `logical_block_size_bytes = 512`
- `metadata_bytes_per_block = 8`
- `physical_block_size_bytes = 512` или `4096`, если известен физический размер блока
```json
"storage": [
{
@@ -370,6 +382,9 @@ GET /ingest/hardware/jobs/{job_id}
"type": "NVMe",
"model": "INTEL SSDPF2KX076T1",
"size_gb": 7680,
"logical_block_size_bytes": 512,
"physical_block_size_bytes": 4096,
"metadata_bytes_per_block": 8,
"temperature_c": 38.5,
"power_on_hours": 12450,
"unsafe_shutdowns": 3,
@@ -592,7 +607,6 @@ PSU без `serial_number` игнорируется.
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора в рамках секции |
| `location` | string | нет | Физическое расположение |
| `rpm` | int | нет | Обороты, RPM |
| `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` |
@@ -601,7 +615,6 @@ PSU без `serial_number` игнорируется.
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора |
| `location` | string | нет | Физическое расположение |
| `voltage_v` | float | нет | Напряжение, В |
| `current_a` | float | нет | Ток, А |
| `power_w` | float | нет | Мощность, Вт |
@@ -612,7 +625,6 @@ PSU без `serial_number` игнорируется.
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора |
| `location` | string | нет | Физическое расположение |
| `celsius` | float | нет | Температура, °C |
| `threshold_warning_celsius` | float | нет | Порог Warning, °C |
| `threshold_critical_celsius` | float | нет | Порог Critical, °C |
@@ -623,29 +635,29 @@ PSU без `serial_number` игнорируется.
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора |
| `location` | string | нет | Физическое расположение |
| `value` | float | нет | Значение |
| `unit` | string | нет | Единица измерения |
| `status` | string | нет | Статус |
**Правила sensors:**
- Идентификатор сенсора: пара `(sensor_type, name)`. Дубли в одном payload — берётся первое вхождение.
- `location` для сенсоров передавать не нужно и не следует: в Reanimator location/slot используется только для проверки перемещения и установки компонентов, а не для last-known-value sensor ingest.
- Сенсоры без `name` игнорируются.
- При каждом импорте значения перезаписываются (upsert по ключу).
```json
"sensors": {
"fans": [
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" },
{ "name": "FAN_CPU0", "location": "CPU0", "rpm": 5600, "status": "OK" }
{ "name": "FAN1", "rpm": 4200, "status": "OK" },
{ "name": "FAN_CPU0", "rpm": 5600, "status": "OK" }
],
"power": [
{ "name": "12V Rail", "location": "Mainboard", "voltage_v": 12.06, "status": "OK" },
{ "name": "PSU0 Input", "location": "PSU0", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
{ "name": "12V Rail", "voltage_v": 12.06, "status": "OK" },
{ "name": "PSU0 Input", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
],
"temperatures": [
{ "name": "CPU0 Temp", "location": "CPU0", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
{ "name": "Inlet Temp", "location": "Front", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
{ "name": "CPU0 Temp", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
{ "name": "Inlet Temp", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
],
"other": [
{ "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" }
@@ -655,6 +667,31 @@ PSU без `serial_number` игнорируется.
---
## Секция platform_config
Необязательный объект с произвольными ключами — настройки платформы как есть из источника (BIOS, Redfish, IPMI).
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `platform_config` | object | нет | Произвольный объект: ключи — строки, значения — строки, числа, булевы |
**Правила platform_config:**
- Содержимое объекта не валидируется: передавайте параметры как есть.
- При каждом импорте хранится latest-snapshot per machine; история изменений по каждому ключу накапливается отдельно.
- Если секция отсутствует или равна `null` — данные платформы не обновляются.
```json
"platform_config": {
"SecureBoot": "Enabled",
"BiosVersion": "06.08.05",
"TpmEnabled": true,
"NumaEnabled": false,
"HyperThreading": "Enabled"
}
```
---
## Обработка статусов компонентов
| Статус | Поведение |
@@ -787,6 +824,12 @@ PSU без `serial_number` игнорируется.
"other": [
{ "name": "System Humidity", "value": 38.5, "unit": "%" }
]
},
"platform_config": {
"SecureBoot": "Enabled",
"BiosVersion": "06.08.05",
"TpmEnabled": true,
"HyperThreading": "Enabled"
}
}
}

View File

@@ -0,0 +1,134 @@
# GRUB bitmap error: null src bitmap in grub_video_bitmap_create_scaled
## Symptom
```
error: null src bitmap in grub_video_bitmap_create_scaled.
Press any key to continue...
```
Appears on boot before the GRUB menu renders. The menu still appears after pressing a key,
but without the bee logo. Reproduced on real hardware (Lenovo SR650 V3, ASUS GPU servers).
## Root cause model
`grub_video_bitmap_create_scaled` receives a null `src` pointer, meaning the PNG loader
returned null for `bee-logo.png`. GRUB calls this function even when no explicit
`width`/`height` are set in `theme.txt` — it is invoked any time an image component is
rendered, passing the image's natural dimensions as the target size.
The PNG file is referenced as `file = "bee-logo.png"` (relative to theme dir).
GRUB resolves this to `/boot/grub/live-theme/bee-logo.png`.
## Attempts that did NOT fix the error
### Attempt 1 — add explicit `width`/`height` to image block (d52ec67)
**What was done:** First introduction of bee-logo.png with:
```
+ image {
top = 4%
left = 50%-200
width = 400
height = 400
file = "bee-logo.png"
}
```
PNG at this point was RGBA (color_type=6).
**Result:** Error appeared immediately on first ISO build.
---
### Attempt 2 — remove `width`/`height` from image block (aa284ae)
**Hypothesis:** Explicit scaling dimensions trigger the scale path; removing them avoids it.
**What was done:** Removed `width = 400` and `height = 400` from the image block.
```
+ image {
top = 4%
left = 50%-200
file = "bee-logo.png"
}
```
**Result:** Error persists. GRUB calls `grub_video_bitmap_create_scaled` regardless of whether
`width`/`height` are specified — if the bitmap is null (loading failed), the error fires either way.
---
### Attempt 3 — convert PNG to RGBA + strip metadata chunks (6112094)
**Hypothesis:** GRUB's minimal PNG parser is confused by metadata chunks (cHRM, bKGD, tIME, tEXt).
Also re-ordered `terminal_output gfxterm` before `insmod png` / theme load.
**What was done:**
- Converted PNG to RGBA color_type=6, stripped all ancillary chunks
- Moved `terminal_output gfxterm` earlier in config.cfg
- Removed echo ASCII art banner from grub.cfg
**Result:** Error persists — and this change actually confirmed RGBA does not work:
GRUB's PNG loader does not render RGBA PNGs correctly on this platform.
---
### Attempt 4 — convert PNG from RGBA back to RGB (333c44f, most recent)
**Hypothesis:** GRUB does not support RGBA (color_type=6); RGB (color_type=2) is the correct format.
Alpha channel composited onto black background (#000000) to match `desktop-color`.
**What was done:** Converted bee-logo.png from RGBA to RGB via ImageMagick.
**Current file state:**
- 400×400 px, 8-bit/color RGB, non-interlaced
- Only IHDR + IDAT + IEND chunks (no metadata)
- `insmod png` is present in config.cfg
- `terminal_output gfxterm` runs before theme is sourced
- No explicit `width`/`height` in image block
**Result:** Error still occurs on real hardware. Despite the PNG being nominally correct
(RGB, non-interlaced, minimal chunks), the bitmap load returns null.
## Confirmed root cause (verified on 172.16.41.94, 2026-04-30)
The EFI partition (`sda2`, vfat, 5 MB) contains only:
```
/EFI/boot/bootia32.efi
/EFI/boot/bootx64.efi
/EFI/boot/grubx64.efi
/boot/grub/grub.cfg
```
`config.cfg`, `theme.cfg`, and the entire `live-theme/` directory (including `bee-logo.png`)
are **absent from the EFI image**. `live-build`'s `lb binary_grub-efi` stage is not
copying these files. GRUB boots, sources only `grub.cfg`, then fails to load the theme
because the file does not exist — returning a null bitmap regardless of PNG format.
All four fix attempts were targeting the wrong layer (PNG format/content).
## Fix (applied 2026-04-30)
Switched from PNG to TGA format:
1. Converted `bee-logo.png``bee-logo.tga` (24-bit uncompressed BGR, top-left origin,
480018 bytes). Conversion done via Python stdlib (no external tools needed).
2. `config.cfg`: `insmod png``insmod tga`
3. `theme.txt`: `file = "bee-logo.png"``file = "bee-logo.tga"`
**Why TGA works:** GRUB's TGA reader (`tga.mod`) handles uncompressed 24-bit images
trivially — no decompression, no complex chunk parsing. The module is present on-disk
(`x86_64-efi/tga.mod`). PNG was failing despite a valid file; the exact GRUB bug is
unknown but the PNG reader in Debian bookworm's grub2 is known to be fragile.
The old `bee-logo.png` is kept in the tree (may be useful for other tools) but is no
longer referenced by the theme.
## Relevant files
| File | Purpose |
|------|---------|
| `iso/builder/config/bootloaders/grub-efi/config.cfg` | insmod png, gfxterm init, theme source |
| `iso/builder/config/bootloaders/grub-efi/theme.cfg` | sets `theme=` path |
| `iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt` | image component definition |
| `iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png` | the logo PNG |

View File

@@ -867,6 +867,7 @@ needs_full_build() {
"${BUILDER_DIR}/config/package-lists" \
"${BUILDER_DIR}/config/hooks" \
"${BUILDER_DIR}/config/archives" \
"${BUILDER_DIR}/config/bootloaders" \
-newer "${FULL_BUILD_MARKER}" 2>/dev/null | head -1)
if [ -n "$_heavy" ]; then

View File

@@ -27,5 +27,5 @@ insmod gfxterm
terminal_input console serial
terminal_output gfxterm serial
insmod png
insmod tga
source /boot/grub/theme.cfg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 78 KiB

After

Width:  |  Height:  |  Size: 77 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 469 KiB

View File

@@ -9,7 +9,7 @@ terminal-font: "Unifont Regular 16"
+ image {
top = 4%
left = 50%-200
file = "bee-logo.png"
file = "bee-logo.tga"
}
#help bar at the bottom

View File

@@ -31,6 +31,7 @@ systemctl enable bee-preflight.service
systemctl enable bee-audit.service
systemctl enable bee-web.service
systemctl enable bee-sshsetup.service
systemctl enable bee-blackbox.service
systemctl enable bee-selfheal.timer
systemctl enable bee-boot-status.service
systemctl enable ssh.service

View File

@@ -47,18 +47,27 @@ vim-tiny
mc
htop
nvtop
btop
sudo
zstd
mstflint
memtester
stress-ng
stressapptest
# QR codes (for displaying audit results)
qrencode
fio
iperf3
iotop
nload
tcpdump
hdparm
sysstat
lsscsi
sg3-utils
jq
curl
net-tools
# Local desktop (openbox + chromium kiosk)
gparted
openbox
tint2
feh

BIN
iso/vendor/arcconf vendored Executable file

Binary file not shown.

BIN
iso/vendor/sas2ircu vendored Executable file

Binary file not shown.

BIN
iso/vendor/sas3ircu vendored Executable file

Binary file not shown.

BIN
iso/vendor/ssacli vendored Executable file

Binary file not shown.

BIN
iso/vendor/storcli64 vendored Executable file

Binary file not shown.

View File

@@ -1,74 +0,0 @@
#!/bin/sh
# fetch-vendor.sh — download proprietary vendor utilities into iso/vendor.
#
# Usage:
# STORCLI_URL=... STORCLI_SHA256=... \
# SAS2IRCU_URL=... SAS2IRCU_SHA256=... \
# SAS3IRCU_URL=... SAS3IRCU_SHA256=... \
# MSTFLINT_URL=... MSTFLINT_SHA256=... \
# sh scripts/fetch-vendor.sh
set -eu
ROOT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
OUT_DIR="$ROOT_DIR/iso/vendor"
mkdir -p "$OUT_DIR"
need_cmd() {
command -v "$1" >/dev/null 2>&1 || { echo "ERROR: required command not found: $1" >&2; exit 1; }
}
need_cmd sha256sum
download_to() {
url="$1"
out="$2"
if command -v wget >/dev/null 2>&1; then
wget -O "$out" "$url"
return 0
fi
if command -v curl >/dev/null 2>&1; then
curl -fsSL "$url" -o "$out"
return 0
fi
echo "ERROR: required command not found: wget or curl" >&2
exit 1
}
fetch_one() {
name="$1"
url="$2"
sha="$3"
if [ -z "$url" ] || [ -z "$sha" ]; then
echo "[vendor] skip $name (URL/SHA not provided)"
return 0
fi
dst="$OUT_DIR/$name"
tmp="$dst.tmp"
echo "[vendor] downloading $name"
download_to "$url" "$tmp"
got=$(sha256sum "$tmp" | awk '{print $1}')
want=$(echo "$sha" | tr '[:upper:]' '[:lower:]')
if [ "$got" != "$want" ]; then
rm -f "$tmp"
echo "ERROR: checksum mismatch for $name" >&2
echo " got: $got" >&2
echo " want: $want" >&2
exit 1
fi
mv "$tmp" "$dst"
chmod +x "$dst" || true
echo "[vendor] ok: $name"
}
fetch_one "storcli64" "${STORCLI_URL:-}" "${STORCLI_SHA256:-}"
fetch_one "sas2ircu" "${SAS2IRCU_URL:-}" "${SAS2IRCU_SHA256:-}"
fetch_one "sas3ircu" "${SAS3IRCU_URL:-}" "${SAS3IRCU_SHA256:-}"
fetch_one "mstflint" "${MSTFLINT_URL:-}" "${MSTFLINT_SHA256:-}"
echo "[vendor] done. output dir: $OUT_DIR"