Compare commits

..

6 Commits

Author SHA1 Message Date
Mikhail Chusavitin 0b8a2ff83f Add validate test matrix and GPU test methodology docs
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 10:47:08 +03:00
Mikhail Chusavitin 2c22b01fe3 Fix IPMI hangs, add VROC license, fix blackbox service, drop qrencode
IPMI hang fix (Lenovo XCC SR650 V3):
- Add pluggable ipmi_profile system with per-vendor timeouts and fruEarlyExit flag
- Lenovo profile: 90s FRU timeout, streaming early-exit stops after PSU blocks found
- collectFRUEarlyExit streams ipmitool fru print and kills process once PSU blocks
  are followed by a non-PSU header (~6s instead of ~108s on 54-device FRU list)
- collectBMCFirmware and collectPSUs accept manufacturer and apply profile timeouts

VROC license detection:
- Detect VMD/VROC controller in PCIe list, run mdadm --detail-platform
- Parse "License:" line; store as snap.VROCLicense in HardwareSnapshot

Blackbox service fix:
- bee-blackbox.service was missing from systemctl enable list in ISO build hook
- Service never started on boot; state file never written; UI button stayed "Enable"

Drop qrencode:
- Remove from package list, standardTools API check, and runtime-flows doc

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 10:46:59 +03:00
Mikhail Chusavitin ec89616585 Add storage block geometry to audit and viewer 2026-04-29 17:39:11 +03:00
Mikhail Chusavitin c0dbbf96ad Add vendor RAID tools for livecd 2026-04-29 17:31:25 +03:00
Mikhail Chusavitin 76484b123c Fix fast-path: treat bootloader config changes as heavy
config/bootloaders was missing from the needs_full_build heavy-file
list, so changes to GRUB theme assets (e.g. bee-logo.png RGBA→RGB fix
in 333c44f) were silently skipped by the squashfs-surgery fast-path.
The old broken PNG stayed in boot/grub/live-theme/ inside the ISO.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 15:36:29 +03:00
Mikhail Chusavitin 8901596152 Add server diagnostic tools to ISO, drop btop
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 13:18:50 +03:00
29 changed files with 936 additions and 146 deletions
+8 -11
View File
@@ -3,6 +3,7 @@ package collector
import ( import (
"bee/audit/internal/schema" "bee/audit/internal/schema"
"bufio" "bufio"
"context"
"log/slog" "log/slog"
"os" "os"
"os/exec" "os/exec"
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
return string(out), nil return string(out), nil
} }
var execIpmitool = func(args ...string) (string, error) {
out, err := exec.Command("ipmitool", args...).Output()
if err != nil {
return "", err
}
return string(out), nil
}
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record // collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
// plus the BIOS firmware entry. Any failure is logged and returns zero values. // plus the BIOS firmware entry. Any failure is logged and returns zero values.
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) { func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
// collectBMCFirmware collects BMC firmware version via ipmitool mc info. // collectBMCFirmware collects BMC firmware version via ipmitool mc info.
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs. // Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
func collectBMCFirmware() []schema.HardwareFirmwareRecord { func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
if _, err := exec.LookPath("ipmitool"); err != nil { if _, err := exec.LookPath("ipmitool"); err != nil {
return nil return nil
} }
if _, err := os.Stat("/dev/ipmi0"); err != nil { if _, err := os.Stat("/dev/ipmi0"); err != nil {
return nil return nil
} }
out, err := execIpmitool("mc", "info") profile := selectIPMIProfile(manufacturer)
ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
raw, err := cmd.Output()
if err != nil { if err != nil {
slog.Info("bmc: ipmitool mc info unavailable", "err", err) slog.Info("bmc: ipmitool mc info unavailable", "err", err)
return nil return nil
} }
version := parseBMCFirmwareRevision(out) version := parseBMCFirmwareRevision(string(raw))
if version == "" { if version == "" {
return nil return nil
} }
+4 -2
View File
@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
board, biosFW := collectBoard() board, biosFW := collectBoard()
snap.Board = board snap.Board = board
snap.Firmware = append(snap.Firmware, biosFW...) snap.Firmware = append(snap.Firmware, biosFW...)
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...) snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
snap.CPUs = collectCPUs() snap.CPUs = collectCPUs()
@@ -34,6 +34,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
} }
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc) snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc) snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
bestEffortRescanHotplugStorage()
snap.Storage = collectStorage() snap.Storage = collectStorage()
snap.PCIeDevices = collectPCIe() snap.PCIeDevices = collectPCIe()
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
@@ -44,7 +45,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices) snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices)) snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
snap.PowerSupplies = collectPSUs() snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc) snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
snap.Sensors = buildSensorsFromDoc(sensorDoc) snap.Sensors = buildSensorsFromDoc(sensorDoc)
finalizeSnapshot(&snap, collectedAt) finalizeSnapshot(&snap, collectedAt)
+92
View File
@@ -0,0 +1,92 @@
package collector
// Package-level IPMI tuning profiles.
//
// Each profile is matched by board manufacturer (already known before PSU
// collection runs). The profile drives two things:
// - Per-command timeouts — prevent indefinite hangs on slow BMCs.
// - FRU early-exit — streaming parser stops reading once all PSU entries
// are found, avoiding the tail of non-PSU FRU records.
//
// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
import (
"strings"
"time"
)
// ipmiProfile holds tuning parameters for one or more board manufacturers.
// A profile is selected by selectIPMIProfile via substring match on the
// manufacturer string.
type ipmiProfile struct {
	// name is shown in log messages.
	name string
	// manufacturers is a list of lowercase substrings matched against the
	// board manufacturer string from dmidecode type 1.
	// The manufacturer is lowercased and trimmed before matching, so
	// entries here must already be lowercase.
	manufacturers []string
	// fruTimeout is the hard deadline for the entire `ipmitool fru print`
	// command. Zero means no timeout (not recommended).
	fruTimeout time.Duration
	// sdrTimeout is the hard deadline for `ipmitool sdr`.
	sdrTimeout time.Duration
	// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
	mcInfoTimeout time.Duration
	// fruEarlyExit instructs the streaming FRU parser to stop reading
	// once at least one PSU block has been parsed and the next FRU
	// header is not a PSU header. Useful on servers with many non-PSU
	// FRU devices.
	fruEarlyExit bool
}
// ipmiProfiles lists the vendor profiles in priority order; the first entry
// whose manufacturer substring matches is the one that gets used.
// fruEarlyExit is left at its zero value (false) unless a vendor needs it.
var ipmiProfiles = []ipmiProfile{
	// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
	// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
	// full `fru print` scan takes ~108 s on a loaded BMC. Early exit is
	// enabled so collection stops once PSU records are found.
	{
		name:          "lenovo",
		manufacturers: []string{"lenovo"},
		fruTimeout:    90 * time.Second,
		sdrTimeout:    45 * time.Second,
		mcInfoTimeout: 15 * time.Second,
		fruEarlyExit:  true,
	},
	// HPE iLO-based servers (ProLiant DL/ML/BL).
	{
		name:          "hpe",
		manufacturers: []string{"hp", "hewlett packard"},
		fruTimeout:    60 * time.Second,
		sdrTimeout:    30 * time.Second,
		mcInfoTimeout: 10 * time.Second,
	},
	// Dell iDRAC-based servers.
	{
		name:          "dell",
		manufacturers: []string{"dell"},
		fruTimeout:    60 * time.Second,
		sdrTimeout:    30 * time.Second,
		mcInfoTimeout: 10 * time.Second,
	},
}
// defaultIPMIProfile is the fallback applied when the board manufacturer
// matches none of the vendor profiles. Early exit stays off (zero value).
var defaultIPMIProfile = ipmiProfile{
	name:          "default",
	mcInfoTimeout: 10 * time.Second,
	sdrTimeout:    30 * time.Second,
	fruTimeout:    60 * time.Second,
}
// selectIPMIProfile picks the IPMI tuning profile for a board manufacturer.
// The manufacturer is trimmed and lowercased, then checked against every
// substring of every profile in ipmiProfiles, in order. The first profile
// with a matching substring is returned; defaultIPMIProfile is returned when
// nothing matches.
func selectIPMIProfile(manufacturer string) ipmiProfile {
	needle := strings.TrimSpace(manufacturer)
	needle = strings.ToLower(needle)

	for i := range ipmiProfiles {
		candidate := ipmiProfiles[i]
		for _, key := range candidate.manufacturers {
			if !strings.Contains(needle, key) {
				continue
			}
			return candidate
		}
	}
	return defaultIPMIProfile
}
+80 -6
View File
@@ -2,6 +2,8 @@ package collector
import ( import (
"bee/audit/internal/schema" "bee/audit/internal/schema"
"bufio"
"context"
"log/slog" "log/slog"
"os/exec" "os/exec"
"regexp" "regexp"
@@ -10,16 +12,29 @@ import (
"strings" "strings"
) )
func collectPSUs() []schema.HardwarePowerSupply { func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
profile := selectIPMIProfile(manufacturer)
var psus []schema.HardwarePowerSupply var psus []schema.HardwarePowerSupply
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil { fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
psus = parseFRU(string(out)) defer fruCancel()
if profile.fruEarlyExit {
psus = collectFRUEarlyExit(fruCtx)
} else { } else {
slog.Info("psu: fru unavailable", "err", err) cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
if out, err := cmd.Output(); err == nil {
psus = parseFRU(string(out))
} else {
slog.Info("psu: fru unavailable", "err", err)
}
} }
sdrData := map[int]psuSDR{} sdrData := map[int]psuSDR{}
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil { sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
defer sdrCancel()
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
if sdrOut, err := cmd.Output(); err == nil {
sdrData = parsePSUSDR(string(sdrOut)) sdrData = parsePSUSDR(string(sdrOut))
if len(psus) == 0 { if len(psus) == 0 {
psus = synthesizePSUsFromSDR(sdrData) psus = synthesizePSUsFromSDR(sdrData)
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
slog.Info("psu: ipmitool unavailable, skipping", "err", err) slog.Info("psu: ipmitool unavailable, skipping", "err", err)
return nil return nil
} }
slog.Info("psu: collected", "count", len(psus)) slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
return psus
}
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
// as soon as it has found all PSU blocks and the next block is not a PSU.
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
pipe, err := cmd.StdoutPipe()
if err != nil {
slog.Info("psu: fru pipe unavailable", "err", err)
return nil
}
if err := cmd.Start(); err != nil {
slog.Info("psu: fru start failed", "err", err)
return nil
}
var psus []schema.HardwarePowerSupply
var currentBlock strings.Builder
slot := 0
psuFound := false
stoppedEarly := false
scanner := bufio.NewScanner(pipe)
for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, "FRU Device Description") {
if currentBlock.Len() > 0 {
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
psus = append(psus, psu)
psuFound = true
slot++
}
currentBlock.Reset()
}
// Stop once we've collected PSUs and hit a non-PSU block header.
if psuFound && !isPSUHeader(strings.ToLower(line)) {
stoppedEarly = true
break
}
}
currentBlock.WriteString(line)
currentBlock.WriteByte('\n')
}
if !stoppedEarly && currentBlock.Len() > 0 {
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
psus = append(psus, psu)
}
}
// Kill the process immediately on early exit rather than waiting for context timeout.
if cmd.Process != nil {
cmd.Process.Kill() //nolint:errcheck
}
cmd.Wait() //nolint:errcheck
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
return psus return psus
} }
+31
View File
@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
return arrays return arrays
} }
// collectVROCLicense queries `mdadm --detail-platform` and returns the value
// parsed from its License line. It returns nil when hasVROCController reports
// no VROC controller in pcie, when the mdadm query fails (logged at info
// level), or when no license value is found in the output.
func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
	if hasVROCController(pcie) {
		raw, err := raidToolQuery("mdadm", "--detail-platform")
		if err == nil {
			return parseMDAdmPlatformLicense(string(raw))
		}
		slog.Info("vroc: mdadm --detail-platform unavailable", "err", err)
	}
	return nil
}
// parseMDAdmPlatformLicense scans raw line by line for the first line that
// starts with "license" (case-insensitive, after trimming) and carries a
// non-empty value after its first colon. The value is returned trimmed and
// lowercased. Returns nil when no such line exists.
func parseMDAdmPlatformLicense(raw string) *string {
	for _, rawLine := range strings.Split(raw, "\n") {
		entry := strings.TrimSpace(rawLine)
		if !strings.HasPrefix(strings.ToLower(entry), "license") {
			continue
		}
		_, value, hasColon := strings.Cut(entry, ":")
		if !hasColon {
			continue
		}
		value = strings.TrimSpace(value)
		if value == "" {
			// Keep looking: a later License line may carry the value.
			continue
		}
		license := strings.ToLower(value)
		return &license
	}
	return nil
}
func queryDeviceSerial(devPath string) string { func queryDeviceSerial(devPath string) string {
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil { if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
var ctrl nvmeIDCtrl var ctrl nvmeIDCtrl
+171 -1
View File
@@ -4,12 +4,52 @@ import (
"bee/audit/internal/schema" "bee/audit/internal/schema"
"encoding/json" "encoding/json"
"log/slog" "log/slog"
"os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"regexp"
"strconv" "strconv"
"strings" "strings"
) )
// Paths and function seams used by bestEffortRescanHotplugStorage. They are
// variables so tests can redirect them.
var (
	pciRescanPath      = "/sys/bus/pci/rescan"
	scsiHostScanGlob   = "/sys/class/scsi_host/host*/scan"
	hotplugWriteFile   = os.WriteFile
	hotplugExecCommand = exec.Command
	hotplugGlob        = filepath.Glob
)

// Patterns for extracting block geometry from tool output.
var (
	// nvmeLBAFCompactRE captures ms and lbads from the "(in use)" lbaf line.
	nvmeLBAFCompactRE = regexp.MustCompile(`(?im)^\s*lbaf\s+\d+\s*:\s*ms:(\d+)\s+lbads:(\d+).*?\(in use\)\s*$`)
	// nvmeLBAFVerboseRE captures metadata and data sizes from the
	// "(in use)" "LBA Format" line.
	nvmeLBAFVerboseRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+\d+\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*?\(in use\)\s*$`)
	// sgReadcapBlockRE captures the logical block length in bytes.
	sgReadcapBlockRE = regexp.MustCompile(`(?im)logical block length\s*=\s*(\d+)\s+bytes`)
	// sgReadcapProtRE matches when protection is reported as enabled.
	sgReadcapProtRE = regexp.MustCompile(`(?im)prot_en\s*=\s*1`)
)
// bestEffortRescanHotplugStorage asks the kernel to rediscover storage
// devices before collection. It writes "1" to the PCI rescan file, writes
// "- - -" to every SCSI host scan file matched by scsiHostScanGlob, and then
// runs `udevadm settle --timeout=10`. Every step is best-effort: failures are
// logged at info level and never abort the remaining steps.
func bestEffortRescanHotplugStorage() {
	switch err := hotplugWriteFile(pciRescanPath, []byte("1\n"), 0644); {
	case err != nil:
		slog.Info("storage: pci rescan skipped", "path", pciRescanPath, "err", err)
	default:
		slog.Info("storage: triggered pci rescan for hotplug discovery")
	}

	scanFiles, globErr := hotplugGlob(scsiHostScanGlob)
	if globErr != nil {
		slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", globErr)
		// Skip the per-host writes entirely when the glob failed.
		scanFiles = nil
	}
	for _, scanFile := range scanFiles {
		if writeErr := hotplugWriteFile(scanFile, []byte("- - -\n"), 0644); writeErr == nil {
			slog.Info("storage: triggered scsi host scan", "path", scanFile)
		} else {
			slog.Info("storage: scsi host scan write failed", "path", scanFile, "err", writeErr)
		}
	}

	if settleOut, settleErr := hotplugExecCommand("udevadm", "settle", "--timeout=10").CombinedOutput(); settleErr != nil {
		slog.Info("storage: udev settle after hotplug rescan failed", "err", settleErr, "output", strings.TrimSpace(string(settleOut)))
	}
}
func collectStorage() []schema.HardwareStorage { func collectStorage() []schema.HardwareStorage {
devs := discoverStorageDevices() devs := discoverStorageDevices()
result := make([]schema.HardwareStorage, 0, len(devs)) result := make([]schema.HardwareStorage, 0, len(devs))
@@ -35,6 +75,8 @@ type lsblkDevice struct {
Model string `json:"model"` Model string `json:"model"`
Tran string `json:"tran"` Tran string `json:"tran"`
Hctl string `json:"hctl"` Hctl string `json:"hctl"`
LogSec string `json:"log-sec"`
PhySec string `json:"phy-sec"`
} }
type lsblkRoot struct { type lsblkRoot struct {
@@ -101,7 +143,7 @@ func isVirtualHDiskModel(model string) bool {
func lsblkDevices() []lsblkDevice { func lsblkDevices() []lsblkDevice {
out, err := exec.Command("lsblk", "-J", "-d", out, err := exec.Command("lsblk", "-J", "-d",
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output() "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL,LOG-SEC,PHY-SEC").Output()
if err != nil { if err != nil {
slog.Warn("storage: lsblk failed", "err", err) slog.Warn("storage: lsblk failed", "err", err)
return nil return nil
@@ -208,6 +250,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
present := true present := true
s := schema.HardwareStorage{Present: &present} s := schema.HardwareStorage{Present: &present}
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name} s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
applyStorageBlockGeometry(&s, dev)
tran := strings.ToLower(dev.Tran) tran := strings.ToLower(dev.Tran)
devPath := "/dev/" + dev.Name devPath := "/dev/" + dev.Name
@@ -327,6 +370,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
lifeRemainingPct: lifeRemaining, lifeRemainingPct: lifeRemaining,
} }
applySCSISmartctlTelemetry(&s, raw, &status) applySCSISmartctlTelemetry(&s, raw, &status)
applySCSIProtectionBlockGeometry(&s, devPath)
setStorageHealthStatus(&s, status) setStorageHealthStatus(&s, status)
return s return s
} }
@@ -374,6 +418,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
Interface: &iface, Interface: &iface,
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name}, Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
} }
applyStorageBlockGeometry(&s, dev)
devPath := "/dev/" + dev.Name devPath := "/dev/" + dev.Name
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" { if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
@@ -408,6 +453,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
} }
} }
} }
applyNVMeBlockGeometry(&s, devPath)
// smart-log: wear telemetry // smart-log: wear telemetry
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil { if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
@@ -540,6 +586,19 @@ func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, s
"path:user_capacity.block_size", "path:user_capacity.block_size",
) )
if hasBlockSize && blockSize > 0 { if hasBlockSize && blockSize > 0 {
if s.LogicalBlockSizeBytes == nil {
s.LogicalBlockSizeBytes = &blockSize
}
if s.MetadataBytesPerBlock == nil {
zero := int64(0)
s.MetadataBytesPerBlock = &zero
}
if s.Telemetry == nil {
s.Telemetry = map[string]any{}
}
s.Telemetry["logical_block_size_bytes"] = *s.LogicalBlockSizeBytes
s.Telemetry["metadata_bytes_per_block"] = *s.MetadataBytesPerBlock
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
if v, ok := firstInt64(raw, if v, ok := firstInt64(raw,
"path:logical_blocks_written", "path:logical_blocks_written",
"path:total_lbas_written", "path:total_lbas_written",
@@ -557,6 +616,117 @@ func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, s
} }
} }
// applyStorageBlockGeometry copies the logical and physical sector sizes
// reported by lsblk (dev.LogSec / dev.PhySec) onto s and mirrors them into
// s.Telemetry. When a logical size is available and no metadata size has been
// set yet, metadata bytes per block defaults to 0. The "block_format"
// telemetry entry is written whenever both logical and metadata sizes are
// known. Does nothing when s is nil or neither size parses to a positive
// value.
func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
	if s == nil {
		return
	}
	logSec, phySec := parseStorageBytes(dev.LogSec), parseStorageBytes(dev.PhySec)
	haveLogical, havePhysical := logSec > 0, phySec > 0
	if !haveLogical && !havePhysical {
		return
	}
	if s.Telemetry == nil {
		s.Telemetry = map[string]any{}
	}

	if haveLogical {
		s.LogicalBlockSizeBytes = &logSec
		s.Telemetry["logical_block_size_bytes"] = logSec
		if s.MetadataBytesPerBlock == nil {
			var noMetadata int64
			s.MetadataBytesPerBlock = &noMetadata
			s.Telemetry["metadata_bytes_per_block"] = noMetadata
		}
	}
	if havePhysical {
		s.PhysicalBlockSizeBytes = &phySec
		s.Telemetry["physical_block_size_bytes"] = phySec
	}

	if s.LogicalBlockSizeBytes == nil || s.MetadataBytesPerBlock == nil {
		return
	}
	s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
}
// applyNVMeBlockGeometry runs `nvme id-ns <devPath> -H`, parses the in-use
// LBA format from its combined output, and stores the resulting data and
// metadata sizes on s. It is a no-op when s is nil, devPath is blank, the
// command fails, or no in-use format can be parsed.
func applyNVMeBlockGeometry(s *schema.HardwareStorage, devPath string) {
	if s == nil {
		return
	}
	if strings.TrimSpace(devPath) == "" {
		return
	}
	raw, err := exec.Command("nvme", "id-ns", devPath, "-H").CombinedOutput()
	if err != nil {
		return
	}
	if data, meta, parsed := parseNVMeBlockFormat(string(raw)); parsed {
		setStorageBlockGeometry(s, data, meta)
	}
}
// applySCSIProtectionBlockGeometry runs `sg_readcap -l <devPath>`, parses the
// logical block length and protection flag from its combined output, and
// stores the resulting data and metadata sizes on s. It is a no-op when s is
// nil, devPath is blank, the command fails, or the output cannot be parsed.
func applySCSIProtectionBlockGeometry(s *schema.HardwareStorage, devPath string) {
	if s == nil {
		return
	}
	if strings.TrimSpace(devPath) == "" {
		return
	}
	raw, err := exec.Command("sg_readcap", "-l", devPath).CombinedOutput()
	if err != nil {
		return
	}
	if data, meta, parsed := parseSCSIBlockFormat(string(raw)); parsed {
		setStorageBlockGeometry(s, data, meta)
	}
}
// setStorageBlockGeometry overwrites the logical block size and metadata
// bytes per block on s and refreshes the matching telemetry entries,
// including "block_format". The call is ignored when s is nil, dataBytes is
// not positive, or metadataBytes is negative.
func setStorageBlockGeometry(s *schema.HardwareStorage, dataBytes, metadataBytes int64) {
	switch {
	case s == nil, dataBytes <= 0, metadataBytes < 0:
		return
	}
	if s.Telemetry == nil {
		s.Telemetry = map[string]any{}
	}

	s.LogicalBlockSizeBytes, s.MetadataBytesPerBlock = &dataBytes, &metadataBytes

	telemetry := s.Telemetry
	telemetry["logical_block_size_bytes"] = dataBytes
	telemetry["metadata_bytes_per_block"] = metadataBytes
	telemetry["block_format"] = formatBlockFormat(dataBytes, metadataBytes)
}
// formatBlockFormat renders a block layout as "<data>+<metadata>" using
// base-10 integers, e.g. "512+8".
func formatBlockFormat(dataBytes, metadataBytes int64) string {
	data := strconv.FormatInt(dataBytes, 10)
	metadata := strconv.FormatInt(metadataBytes, 10)
	return data + "+" + metadata
}
// parseNVMeBlockFormat extracts the in-use LBA format from `nvme id-ns`
// output. The compact form ("lbaf N : ms:M lbads:S ... (in use)") is tried
// first, with the data size computed as 2^S for 0 <= S < 63. If that does not
// yield a result, the verbose form ("LBA Format N : Metadata Size: M bytes -
// Data Size: D bytes ... (in use)") is tried. ok is false when neither form
// produces a valid result.
func parseNVMeBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
	if compact := nvmeLBAFCompactRE.FindStringSubmatch(raw); len(compact) == 3 {
		meta, metaErr := strconv.ParseInt(compact[1], 10, 64)
		shift, shiftErr := strconv.ParseInt(compact[2], 10, 64)
		// Bound the shift so the result stays within int64.
		usable := metaErr == nil && shiftErr == nil && shift >= 0 && shift < 63
		if usable {
			return int64(1) << shift, meta, true
		}
	}

	verbose := nvmeLBAFVerboseRE.FindStringSubmatch(raw)
	if len(verbose) != 3 {
		return 0, 0, false
	}
	meta, metaErr := strconv.ParseInt(verbose[1], 10, 64)
	size, sizeErr := strconv.ParseInt(verbose[2], 10, 64)
	if metaErr != nil || sizeErr != nil || size <= 0 {
		return 0, 0, false
	}
	return size, meta, true
}
// parseSCSIBlockFormat extracts the logical block length from `sg_readcap`
// output. Metadata bytes are reported as 8 when the output contains
// "prot_en=1" and 0 otherwise. ok is false when no positive block length can
// be parsed.
func parseSCSIBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
	match := sgReadcapBlockRE.FindStringSubmatch(raw)
	if len(match) != 2 {
		return 0, 0, false
	}
	size, parseErr := strconv.ParseInt(match[1], 10, 64)
	if parseErr != nil || size <= 0 {
		return 0, 0, false
	}

	var protectionBytes int64
	if sgReadcapProtRE.MatchString(raw) {
		protectionBytes = 8
	}
	return size, protectionBytes, true
}
func firstInt64(root map[string]any, candidates ...string) (int64, bool) { func firstInt64(root map[string]any, candidates ...string) (int64, bool) {
for _, candidate := range candidates { for _, candidate := range candidates {
if !strings.HasPrefix(candidate, "path:") { if !strings.HasPrefix(candidate, "path:") {
@@ -0,0 +1,69 @@
package collector
import "testing"
// TestParseNVMeBlockFormatCompact checks that the compact lbaf line marked
// "(in use)" is selected and that lbads:9 is reported as 512 data bytes.
func TestParseNVMeBlockFormatCompact(t *testing.T) {
	t.Parallel()

	const raw = `
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
lbaf 1 : ms:8 lbads:9 rp:0x1
`
	data, meta, parsed := parseNVMeBlockFormat(raw)
	if !parsed {
		t.Fatal("parseNVMeBlockFormat returned ok=false")
	}
	if !(data == 512 && meta == 0) {
		t.Fatalf("got %d+%d want 512+0", data, meta)
	}
}
// TestParseNVMeBlockFormatVerbose checks that the verbose "LBA Format" line
// marked "(in use)" is selected and its sizes are returned as written.
func TestParseNVMeBlockFormatVerbose(t *testing.T) {
	t.Parallel()

	const raw = `
LBA Format 0 : Metadata Size: 8 bytes - Data Size: 512 bytes - Relative Performance: 0 Better (in use)
LBA Format 1 : Metadata Size: 0 bytes - Data Size: 4096 bytes - Relative Performance: 1 Best
`
	data, meta, parsed := parseNVMeBlockFormat(raw)
	if !parsed {
		t.Fatal("parseNVMeBlockFormat returned ok=false")
	}
	if !(data == 512 && meta == 8) {
		t.Fatalf("got %d+%d want 512+8", data, meta)
	}
}
// TestParseSCSIBlockFormatWithProtection checks that prot_en=1 yields 8
// metadata bytes alongside the reported logical block length.
func TestParseSCSIBlockFormatWithProtection(t *testing.T) {
	t.Parallel()

	const raw = `
Read Capacity results:
Protection: prot_en=1, p_type=1, p_i_exponent=0
Logical block length=512 bytes
`
	data, meta, parsed := parseSCSIBlockFormat(raw)
	if !parsed {
		t.Fatal("parseSCSIBlockFormat returned ok=false")
	}
	if !(data == 512 && meta == 8) {
		t.Fatalf("got %d+%d want 512+8", data, meta)
	}
}
// TestParseSCSIBlockFormatWithoutProtection checks that prot_en=0 yields 0
// metadata bytes alongside the reported logical block length.
func TestParseSCSIBlockFormatWithoutProtection(t *testing.T) {
	t.Parallel()

	const raw = `
Read Capacity results:
Protection: prot_en=0, p_type=0, p_i_exponent=0
Logical block length=4096 bytes
`
	data, meta, parsed := parseSCSIBlockFormat(raw)
	if !parsed {
		t.Fatal("parseSCSIBlockFormat returned ok=false")
	}
	if !(data == 4096 && meta == 0) {
		t.Fatalf("got %d+%d want 4096+0", data, meta)
	}
}
@@ -1,6 +1,12 @@
package collector package collector
import "testing" import (
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
)
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) { func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
t.Parallel() t.Parallel()
@@ -31,3 +37,82 @@ func TestParseStorageBytes(t *testing.T) {
t.Fatalf("parseStorageBytes invalid=%d want 0", got) t.Fatalf("parseStorageBytes invalid=%d want 0", got)
} }
} }
// TestBestEffortRescanHotplugStorage redirects the rescan paths into a temp
// directory, puts a stub `udevadm` first on PATH, and verifies that the PCI
// rescan file, every SCSI host scan file, and the udevadm invocation receive
// the expected payloads.
//
// This test is deliberately NOT parallel: it rewrites the process-wide PATH
// and package-level variables, which would race with any test running
// concurrently in this package.
func TestBestEffortRescanHotplugStorage(t *testing.T) {
	tmp := t.TempDir()
	rescanPath := filepath.Join(tmp, "pci-rescan")
	scanDir := filepath.Join(tmp, "scsi_host")
	host0Path := filepath.Join(scanDir, "host0", "scan")
	host1Path := filepath.Join(scanDir, "host1", "scan")
	argsPath := filepath.Join(tmp, "udevadm-args")
	toolPath := filepath.Join(tmp, "udevadm")

	if err := os.MkdirAll(filepath.Dir(host0Path), 0755); err != nil {
		t.Fatalf("mkdir host0: %v", err)
	}
	if err := os.MkdirAll(filepath.Dir(host1Path), 0755); err != nil {
		t.Fatalf("mkdir host1: %v", err)
	}
	if err := os.WriteFile(host0Path, nil, 0644); err != nil {
		t.Fatalf("touch host0 scan: %v", err)
	}
	if err := os.WriteFile(host1Path, nil, 0644); err != nil {
		t.Fatalf("touch host1 scan: %v", err)
	}

	// The stub records its arguments so the invocation can be asserted below.
	script := "#!/bin/sh\nprintf '%s' \"$*\" > \"" + argsPath + "\"\n"
	if err := os.WriteFile(toolPath, []byte(script), 0755); err != nil {
		t.Fatalf("write udevadm stub: %v", err)
	}

	// t.Setenv restores PATH when the test ends.
	t.Setenv("PATH", tmp+string(os.PathListSeparator)+os.Getenv("PATH"))

	oldRescanPath := pciRescanPath
	oldSCSIGlob := scsiHostScanGlob
	oldWriteFile := hotplugWriteFile
	oldExecCommand := hotplugExecCommand
	oldGlob := hotplugGlob
	t.Cleanup(func() {
		pciRescanPath = oldRescanPath
		scsiHostScanGlob = oldSCSIGlob
		hotplugWriteFile = oldWriteFile
		hotplugExecCommand = oldExecCommand
		hotplugGlob = oldGlob
	})

	// Point the paths at the temp tree and pin the seams to the real
	// implementations so the test exercises actual file and process I/O.
	pciRescanPath = rescanPath
	scsiHostScanGlob = filepath.Join(scanDir, "host*", "scan")
	hotplugWriteFile = os.WriteFile
	hotplugExecCommand = exec.Command
	hotplugGlob = filepath.Glob

	bestEffortRescanHotplugStorage()

	raw, err := os.ReadFile(rescanPath)
	if err != nil {
		t.Fatalf("read rescan file: %v", err)
	}
	if string(raw) != "1\n" {
		t.Fatalf("rescan payload=%q want %q", string(raw), "1\n")
	}

	for _, path := range []string{host0Path, host1Path} {
		raw, err := os.ReadFile(path)
		if err != nil {
			t.Fatalf("read scsi scan file %s: %v", path, err)
		}
		if string(raw) != "- - -\n" {
			t.Fatalf("scsi scan payload at %s =%q want %q", path, string(raw), "- - -\n")
		}
	}

	args, err := os.ReadFile(argsPath)
	if err != nil {
		t.Fatalf("read udevadm args: %v", err)
	}
	if got := strings.TrimSpace(string(args)); got != "settle --timeout=10" {
		t.Fatalf("udevadm args=%q want %q", got, "settle --timeout=10")
	}
}
@@ -40,6 +40,12 @@ func TestApplySCSISmartctlTelemetry(t *testing.T) {
if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 { if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 {
t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes) t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes)
} }
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 4096 {
t.Fatalf("logical_block_size_bytes=%v want 4096", disk.LogicalBlockSizeBytes)
}
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
}
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 { if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 {
t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct) t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct)
} }
@@ -80,6 +86,12 @@ func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T)
if *disk.WrittenBytes != 20 { if *disk.WrittenBytes != 20 {
t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes) t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
} }
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 512 {
t.Fatalf("logical_block_size_bytes=%v want 512", disk.LogicalBlockSizeBytes)
}
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
}
if *disk.LifeRemainingPct != 30 { if *disk.LifeRemainingPct != 30 {
t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct) t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
} }
+29
View File
@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
} }
} }
// TestParseMDAdmPlatformLicense covers three mdadm --detail-platform
// outputs: a Premium license, a Standard license, and one with no License
// line. Parsed values are expected in lowercase.
func TestParseMDAdmPlatformLicense(t *testing.T) {
	const premiumOutput = `Platform : Intel(R) Virtual RAID on CPU
Version : 1.3.0.1138
RAID Levels : raid0 raid1 raid5 raid10
Total Disks : 4
License : Premium
`
	const standardOutput = `Platform : Intel(R) Virtual RAID on CPU
License : Standard
`
	const unlicensedOutput = `Platform : Intel(R) Virtual RAID on CPU
Version : 1.0.0
`

	if license := parseMDAdmPlatformLicense(premiumOutput); license == nil || *license != "premium" {
		t.Fatalf("expected 'premium', got %v", license)
	}
	if license := parseMDAdmPlatformLicense(standardOutput); license == nil || *license != "standard" {
		t.Fatalf("expected 'standard', got %v", license)
	}
	if license := parseMDAdmPlatformLicense(unlicensedOutput); license != nil {
		t.Fatalf("expected nil, got %v", *license)
	}
}
func TestHasVROCController(t *testing.T) { func TestHasVROCController(t *testing.T) {
intel := vendorIntel intel := vendorIntel
model := "Volume Management Device NVMe RAID Controller" model := "Volume Management Device NVMe RAID Controller"
+28 -24
View File
@@ -66,6 +66,7 @@ type HardwareSnapshot struct {
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"` PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
Sensors *HardwareSensors `json:"sensors,omitempty"` Sensors *HardwareSensors `json:"sensors,omitempty"`
EventLogs []HardwareEventLog `json:"event_logs,omitempty"` EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
VROCLicense *string `json:"vroc_license,omitempty"`
} }
type HardwareHealthSummary struct { type HardwareHealthSummary struct {
@@ -143,30 +144,33 @@ type HardwareMemory struct {
type HardwareStorage struct { type HardwareStorage struct {
HardwareComponentStatus HardwareComponentStatus
Slot *string `json:"slot,omitempty"` Slot *string `json:"slot,omitempty"`
Type *string `json:"type,omitempty"` Type *string `json:"type,omitempty"`
Model *string `json:"model,omitempty"` Model *string `json:"model,omitempty"`
SizeGB *int `json:"size_gb,omitempty"` SizeGB *int `json:"size_gb,omitempty"`
SerialNumber *string `json:"serial_number,omitempty"` LogicalBlockSizeBytes *int64 `json:"logical_block_size_bytes,omitempty"`
Manufacturer *string `json:"manufacturer,omitempty"` PhysicalBlockSizeBytes *int64 `json:"physical_block_size_bytes,omitempty"`
Firmware *string `json:"firmware,omitempty"` MetadataBytesPerBlock *int64 `json:"metadata_bytes_per_block,omitempty"`
Interface *string `json:"interface,omitempty"` SerialNumber *string `json:"serial_number,omitempty"`
Present *bool `json:"present,omitempty"` Manufacturer *string `json:"manufacturer,omitempty"`
TemperatureC *float64 `json:"temperature_c,omitempty"` Firmware *string `json:"firmware,omitempty"`
PowerOnHours *int64 `json:"power_on_hours,omitempty"` Interface *string `json:"interface,omitempty"`
PowerCycles *int64 `json:"power_cycles,omitempty"` Present *bool `json:"present,omitempty"`
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"` TemperatureC *float64 `json:"temperature_c,omitempty"`
MediaErrors *int64 `json:"media_errors,omitempty"` PowerOnHours *int64 `json:"power_on_hours,omitempty"`
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"` PowerCycles *int64 `json:"power_cycles,omitempty"`
WrittenBytes *int64 `json:"written_bytes,omitempty"` UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
ReadBytes *int64 `json:"read_bytes,omitempty"` MediaErrors *int64 `json:"media_errors,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"` ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"` WrittenBytes *int64 `json:"written_bytes,omitempty"`
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"` ReadBytes *int64 `json:"read_bytes,omitempty"`
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"` LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"` LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"` AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
Telemetry map[string]any `json:"-"` ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
Telemetry map[string]any `json:"-"`
} }
type HardwarePCIeDevice struct { type HardwarePCIeDevice struct {
+15 -6
View File
@@ -50,6 +50,9 @@ func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
writtenBytes := int64(9876543210) writtenBytes := int64(9876543210)
readBytes := int64(1234567890) readBytes := int64(1234567890)
lifeRemainingPct := 91.0 lifeRemainingPct := 91.0
logicalBlockSizeBytes := int64(512)
physicalBlockSizeBytes := int64(4096)
metadataBytesPerBlock := int64(8)
payload := HardwareIngestRequest{ payload := HardwareIngestRequest{
CollectedAt: "2026-03-15T15:00:00Z", CollectedAt: "2026-03-15T15:00:00Z",
@@ -57,12 +60,15 @@ func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
Board: HardwareBoard{SerialNumber: "SRV-001"}, Board: HardwareBoard{SerialNumber: "SRV-001"},
Storage: []HardwareStorage{ Storage: []HardwareStorage{
{ {
SerialNumber: stringPtr("DISK-001"), SerialNumber: stringPtr("DISK-001"),
Model: stringPtr("TestDisk"), Model: stringPtr("TestDisk"),
PowerOnHours: &powerOnHours, LogicalBlockSizeBytes: &logicalBlockSizeBytes,
WrittenBytes: &writtenBytes, PhysicalBlockSizeBytes: &physicalBlockSizeBytes,
ReadBytes: &readBytes, MetadataBytesPerBlock: &metadataBytesPerBlock,
LifeRemainingPct: &lifeRemainingPct, PowerOnHours: &powerOnHours,
WrittenBytes: &writtenBytes,
ReadBytes: &readBytes,
LifeRemainingPct: &lifeRemainingPct,
}, },
}, },
}, },
@@ -75,6 +81,9 @@ func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
text := string(data) text := string(data)
for _, needle := range []string{ for _, needle := range []string{
`"storage":[{`, `"storage":[{`,
`"logical_block_size_bytes":512`,
`"physical_block_size_bytes":4096`,
`"metadata_bytes_per_block":8`,
`"power_on_hours":12450`, `"power_on_hours":12450`,
`"written_bytes":9876543210`, `"written_bytes":9876543210`,
`"read_bytes":1234567890`, `"read_bytes":1234567890`,
+1 -1
View File
@@ -1295,7 +1295,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
var standardTools = []string{ var standardTools = []string{
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool", "dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop", "nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
"mstflint", "qrencode", "mstflint",
} }
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
+1
View File
@@ -572,6 +572,7 @@ func (h *handler) handleExportIndex(w http.ResponseWriter, r *http.Request) {
func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) { func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
snapshot, _ := loadSnapshot(h.opts.AuditPath) snapshot, _ := loadSnapshot(h.opts.AuditPath)
snapshot = enrichSnapshotForViewer(snapshot)
body, err := viewer.RenderHTML(snapshot, h.opts.Title) body, err := viewer.RenderHTML(snapshot, h.opts.Title)
if err != nil { if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError) http.Error(w, err.Error(), http.StatusInternalServerError)
+63
View File
@@ -1016,6 +1016,39 @@ func TestViewerRendersLatestSnapshot(t *testing.T) {
} }
} }
// TestViewerRendersDerivedStorageBlockFormat writes an audit snapshot whose
// only disk carries numeric block geometry (logical 512, metadata 8) and
// checks that GET /viewer answers 200 with the combined "512+8" text in the
// page. The needle uses "&#43;", the HTML character reference for "+".
func TestViewerRendersDerivedStorageBlockFormat(t *testing.T) {
	const snapshotJSON = `{
"collected_at":"2026-04-29T00:05:00Z",
"hardware":{
"board":{"serial_number":"SERIAL-NEW"},
"storage":[
{
"serial_number":"DISK-1",
"model":"Test NVMe",
"logical_block_size_bytes":512,
"physical_block_size_bytes":4096,
"metadata_bytes_per_block":8
}
]
}
}`
	auditPath := filepath.Join(t.TempDir(), "audit.json")
	if err := os.WriteFile(auditPath, []byte(snapshotJSON), 0644); err != nil {
		t.Fatal(err)
	}

	srv := NewHandler(HandlerOptions{AuditPath: auditPath})
	req := httptest.NewRequest(http.MethodGet, "/viewer", nil)
	resp := httptest.NewRecorder()
	srv.ServeHTTP(resp, req)

	if resp.Code != http.StatusOK {
		t.Fatalf("status=%d", resp.Code)
	}
	page := resp.Body.String()
	if !strings.Contains(page, "512&#43;8") {
		t.Fatalf("viewer body missing derived block format: %s", page)
	}
}
func TestAuditJSONServesLatestSnapshot(t *testing.T) { func TestAuditJSONServesLatestSnapshot(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
path := filepath.Join(dir, "audit.json") path := filepath.Join(dir, "audit.json")
@@ -1038,6 +1071,36 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
} }
} }
// TestAuditJSONDoesNotInjectDerivedStorageBlockFormat stores a snapshot with
// numeric block geometry on disk and checks that GET /audit.json answers 200
// without any "block_format" key in the body.
func TestAuditJSONDoesNotInjectDerivedStorageBlockFormat(t *testing.T) {
	const snapshotJSON = `{
"hardware":{
"board":{"serial_number":"SERIAL-API"},
"storage":[
{
"serial_number":"DISK-1",
"logical_block_size_bytes":512,
"metadata_bytes_per_block":8
}
]
}
}`
	auditPath := filepath.Join(t.TempDir(), "audit.json")
	if err := os.WriteFile(auditPath, []byte(snapshotJSON), 0644); err != nil {
		t.Fatal(err)
	}

	srv := NewHandler(HandlerOptions{AuditPath: auditPath})
	req := httptest.NewRequest(http.MethodGet, "/audit.json", nil)
	resp := httptest.NewRecorder()
	srv.ServeHTTP(resp, req)

	if resp.Code != http.StatusOK {
		t.Fatalf("status=%d", resp.Code)
	}
	payload := resp.Body.String()
	if strings.Contains(payload, "block_format") {
		t.Fatalf("audit.json should remain contract-only: %s", payload)
	}
}
func TestMissingAuditJSONReturnsNotFound(t *testing.T) { func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"}) handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
+62
View File
@@ -0,0 +1,62 @@
package webui
import (
"encoding/json"
"strconv"
)
// enrichSnapshotForViewer returns the snapshot JSON with a derived
// "block_format" string ("<logical>+<metadata>", e.g. "512+8") added to each
// hardware.storage[] entry that has usable logical_block_size_bytes and
// metadata_bytes_per_block values and no block_format of its own.
//
// The input is returned unchanged when it is empty, is not a JSON object,
// has no storage entries, when nothing could be derived, or when
// re-encoding fails. When at least one entry is enriched, the whole
// document is re-marshalled from its generic map form.
func enrichSnapshotForViewer(snapshot []byte) []byte {
	if len(snapshot) == 0 {
		return snapshot
	}

	var doc map[string]any
	if json.Unmarshal(snapshot, &doc) != nil {
		return snapshot
	}

	// Indexing a nil map is safe, so a missing or mistyped "hardware"
	// section simply produces an empty disk list.
	hw, _ := doc["hardware"].(map[string]any)
	disks, _ := hw["storage"].([]any)
	if len(disks) == 0 {
		return snapshot
	}

	derived := 0
	for i := range disks {
		disk, isObject := disks[i].(map[string]any)
		if !isObject || len(disk) == 0 {
			continue
		}
		// Never overwrite a value the snapshot already carries.
		if _, present := disk["block_format"]; present {
			continue
		}
		logicalSize, haveLogical := jsonNumberToInt64(disk["logical_block_size_bytes"])
		metadataSize, haveMetadata := jsonNumberToInt64(disk["metadata_bytes_per_block"])
		if haveLogical && haveMetadata && logicalSize > 0 && metadataSize >= 0 {
			disk["block_format"] = strconv.FormatInt(logicalSize, 10) + "+" + strconv.FormatInt(metadataSize, 10)
			derived++
		}
	}
	if derived == 0 {
		return snapshot
	}

	enriched, err := json.Marshal(doc)
	if err != nil {
		return snapshot
	}
	return enriched
}
// jsonNumberToInt64 converts a decoded JSON number to int64.
//
// It accepts float64 (what encoding/json produces for numbers decoded into
// any), int64 and int. The boolean result is false for every other type,
// including nil, and for float64 values that cannot be represented exactly
// as an int64: fractional values, NaN, infinities and magnitudes outside the
// int64 range. Rejecting those prevents a silently truncated or
// implementation-defined value from being presented as a real block size.
func jsonNumberToInt64(v any) (int64, bool) {
	switch x := v.(type) {
	case float64:
		// Range check first: converting an out-of-range float to int64
		// has an implementation-defined result.
		if x < -(1<<63) || x >= 1<<63 {
			return 0, false
		}
		n := int64(x)
		// The round trip fails for fractional values and for NaN.
		if float64(n) != x {
			return 0, false
		}
		return n, true
	case int64:
		return x, true
	case int:
		return int64(x), true
	default:
		return 0, false
	}
}
+57
View File
@@ -9,5 +9,62 @@ Generic engineering rules live in `bible/rules/patterns/`.
|---|---| |---|---|
| `architecture/system-overview.md` | What bee does, scope, tech stack | | `architecture/system-overview.md` | What bee does, scope, tech stack |
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order | | `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list |
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract | | `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
| `decisions/` | Architectural decision log, including read-only submodule policy | | `decisions/` | Architectural decision log, including read-only submodule policy |
## Validate Test Matrix
### Validate
- CPU check
- `lscpu`
- `sensors`
- `stress-ng`
- Memory check
- `free`
- `timeout <timeout_sec> memtester`
- `free`
- NVMe storage check
- `nvme id-ctrl`
- `nvme smart-log`
- `nvme device-self-test`
- SATA/SAS storage check
- `smartctl -H -A`
- `smartctl -t short`
- Basic NVIDIA GPU check
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dmidecode -t baseboard`
- `dmidecode -t system`
- `dcgmi diag -r 2`
- Inter-GPU communication check
- `all_reduce_perf`
- GPU bandwidth check
- `dcgmi diag -r nvbandwidth`
### Validate -> Stress
- Extended NVIDIA GPU check
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dmidecode -t baseboard`
- `dmidecode -t system`
- `dcgmi diag -r 3`
- NVIDIA targeted stress
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r targeted_stress`
- NVIDIA targeted power
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r targeted_power`
- NVIDIA pulse test
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r pulse_test`
- Inter-GPU communication check
- `all_reduce_perf`
- GPU bandwidth check
- `dcgmi diag -r nvbandwidth`
@@ -149,7 +149,6 @@ Current validation state:
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0) 6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded) 7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
8. output JSON → /var/log/bee-audit.json 8. output JSON → /var/log/bee-audit.json
9. QR summary to stdout (qrencode if available)
``` ```
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal. Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
@@ -0,0 +1,54 @@
# GPU PCIe Test Methodology
## Validate
- CPU check
- `lscpu`
- `sensors`
- `stress-ng`
- Memory check
- `free`
- `timeout <timeout_sec> memtester`
- `free`
- NVMe storage check
- `nvme id-ctrl`
- `nvme smart-log`
- `nvme device-self-test`
- SATA/SAS storage check
- `smartctl -H -A`
- `smartctl -t short`
- Basic NVIDIA GPU check
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dmidecode -t baseboard`
- `dmidecode -t system`
- `dcgmi diag -r 2`
- Inter-GPU communication check
- `all_reduce_perf`
- GPU bandwidth check
- `dcgmi diag -r nvbandwidth`
## Validate -> Stress
- Extended NVIDIA GPU check
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dmidecode -t baseboard`
- `dmidecode -t system`
- `dcgmi diag -r 3`
- NVIDIA targeted stress
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r targeted_stress`
- NVIDIA targeted power
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r targeted_power`
- NVIDIA pulse test
- `nvidia-smi -pm 1`
- `nvidia-smi -q`
- `dcgmi diag -r pulse_test`
- Inter-GPU communication check
- `all_reduce_perf`
- GPU bandwidth check
- `dcgmi diag -r nvbandwidth`
+58 -15
View File
@@ -1,7 +1,7 @@
--- ---
title: Hardware Ingest JSON Contract title: Hardware Ingest JSON Contract
version: "2.7" version: "2.10"
updated: "2026-03-15" updated: "2026-04-29"
maintainer: Reanimator Core maintainer: Reanimator Core
audience: external-integrators, ai-agents audience: external-integrators, ai-agents
language: ru language: ru
@@ -9,7 +9,7 @@ language: ru
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения # Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
Версия: **2.7** · Дата: **2026-03-15** Версия: **2.10** · Дата: **2026-04-29**
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения). Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов. Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
@@ -22,6 +22,9 @@ language: ru
| Версия | Дата | Изменения | | Версия | Дата | Изменения |
|--------|------|-----------| |--------|------|-----------|
| 2.10 | 2026-04-29 | Для `hardware.storage[]` добавлены необязательные числовые поля `logical_block_size_bytes`, `physical_block_size_bytes`, `metadata_bytes_per_block` для нормализованного описания формата блока накопителя |
| 2.9 | 2026-03-19 | Добавлена необязательная секция `hardware.platform_config` — произвольный объект с настройками платформы (BIOS/Redfish); хранится как latest-snapshot per machine |
| 2.8 | 2026-03-15 | Поле `location` удалено из всех `sensors.*`; сенсоры передаются только по `name` и измеренным значениям |
| 2.7 | 2026-03-15 | Явно запрещён синтез данных в `event_logs`; интеграторы не должны придумывать серийные номера компонентов, если источник их не отдал | | 2.7 | 2026-03-15 | Явно запрещён синтез данных в `event_logs`; интеграторы не должны придумывать серийные номера компонентов, если источник их не отдал |
| 2.6 | 2026-03-15 | Добавлена необязательная секция `event_logs` для dedup/upsert логов `host` / `bmc` / `redfish` вне history timeline | | 2.6 | 2026-03-15 | Добавлена необязательная секция `event_logs` для dedup/upsert логов `host` / `bmc` / `redfish` вне history timeline |
| 2.5 | 2026-03-15 | Добавлено общее необязательное поле `manufactured_year_week` для компонентных секций (`YYYY-Www`) | | 2.5 | 2026-03-15 | Добавлено общее необязательное поле `manufactured_year_week` для компонентных секций (`YYYY-Www`) |
@@ -131,8 +134,9 @@ GET /ingest/hardware/jobs/{job_id}
"storage": [ ... ], "storage": [ ... ],
"pcie_devices": [ ... ], "pcie_devices": [ ... ],
"power_supplies": [ ... ], "power_supplies": [ ... ],
"sensors": { ... }, "sensors": { ... },
"event_logs": [ ... ] "event_logs": [ ... ],
"platform_config": { ... }
} }
} }
``` ```
@@ -343,6 +347,9 @@ GET /ingest/hardware/jobs/{job_id}
| `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` | | `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` |
| `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` | | `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` |
| `size_gb` | int | нет | Размер в ГБ | | `size_gb` | int | нет | Размер в ГБ |
| `logical_block_size_bytes` | int64 | нет | Логический размер пользовательского блока данных, например `512` или `4096` |
| `physical_block_size_bytes` | int64 | нет | Физический размер блока, если известен, например `4096` |
| `metadata_bytes_per_block` | int64 | нет | Metadata / protection bytes на логический блок, например `0` или `8` |
| `temperature_c` | float | нет | Температура накопителя, °C (telemetry) | | `temperature_c` | float | нет | Температура накопителя, °C (telemetry) |
| `power_on_hours` | int64 | нет | Время работы, часы | | `power_on_hours` | int64 | нет | Время работы, часы |
| `power_cycles` | int64 | нет | Количество циклов питания | | `power_cycles` | int64 | нет | Количество циклов питания |
@@ -363,6 +370,11 @@ GET /ingest/hardware/jobs/{job_id}
Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`. Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`.
Формат вида `512+8` в контракт не добавляется отдельным строковым полем. Если источник знает такую форму, он должен передавать её как:
- `logical_block_size_bytes = 512`
- `metadata_bytes_per_block = 8`
- `physical_block_size_bytes = 512` или `4096`, если известен физический размер блока
```json ```json
"storage": [ "storage": [
{ {
@@ -370,6 +382,9 @@ GET /ingest/hardware/jobs/{job_id}
"type": "NVMe", "type": "NVMe",
"model": "INTEL SSDPF2KX076T1", "model": "INTEL SSDPF2KX076T1",
"size_gb": 7680, "size_gb": 7680,
"logical_block_size_bytes": 512,
"physical_block_size_bytes": 4096,
"metadata_bytes_per_block": 8,
"temperature_c": 38.5, "temperature_c": 38.5,
"power_on_hours": 12450, "power_on_hours": 12450,
"unsafe_shutdowns": 3, "unsafe_shutdowns": 3,
@@ -592,7 +607,6 @@ PSU без `serial_number` игнорируется.
| Поле | Тип | Обязательно | Описание | | Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------| |------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора в рамках секции | | `name` | string | **да** | Уникальное имя сенсора в рамках секции |
| `location` | string | нет | Физическое расположение |
| `rpm` | int | нет | Обороты, RPM | | `rpm` | int | нет | Обороты, RPM |
| `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` | | `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` |
@@ -601,7 +615,6 @@ PSU без `serial_number` игнорируется.
| Поле | Тип | Обязательно | Описание | | Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------| |------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора | | `name` | string | **да** | Уникальное имя сенсора |
| `location` | string | нет | Физическое расположение |
| `voltage_v` | float | нет | Напряжение, В | | `voltage_v` | float | нет | Напряжение, В |
| `current_a` | float | нет | Ток, А | | `current_a` | float | нет | Ток, А |
| `power_w` | float | нет | Мощность, Вт | | `power_w` | float | нет | Мощность, Вт |
@@ -612,7 +625,6 @@ PSU без `serial_number` игнорируется.
| Поле | Тип | Обязательно | Описание | | Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------| |------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора | | `name` | string | **да** | Уникальное имя сенсора |
| `location` | string | нет | Физическое расположение |
| `celsius` | float | нет | Температура, °C | | `celsius` | float | нет | Температура, °C |
| `threshold_warning_celsius` | float | нет | Порог Warning, °C | | `threshold_warning_celsius` | float | нет | Порог Warning, °C |
| `threshold_critical_celsius` | float | нет | Порог Critical, °C | | `threshold_critical_celsius` | float | нет | Порог Critical, °C |
@@ -623,29 +635,29 @@ PSU без `serial_number` игнорируется.
| Поле | Тип | Обязательно | Описание | | Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------| |------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора | | `name` | string | **да** | Уникальное имя сенсора |
| `location` | string | нет | Физическое расположение |
| `value` | float | нет | Значение | | `value` | float | нет | Значение |
| `unit` | string | нет | Единица измерения | | `unit` | string | нет | Единица измерения |
| `status` | string | нет | Статус | | `status` | string | нет | Статус |
**Правила sensors:** **Правила sensors:**
- Идентификатор сенсора: пара `(sensor_type, name)`. Дубли в одном payload — берётся первое вхождение. - Идентификатор сенсора: пара `(sensor_type, name)`. Дубли в одном payload — берётся первое вхождение.
- `location` для сенсоров передавать не нужно и не следует: в Reanimator location/slot используется только для проверки перемещения и установки компонентов, а не для last-known-value sensor ingest.
- Сенсоры без `name` игнорируются. - Сенсоры без `name` игнорируются.
- При каждом импорте значения перезаписываются (upsert по ключу). - При каждом импорте значения перезаписываются (upsert по ключу).
```json ```json
"sensors": { "sensors": {
"fans": [ "fans": [
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" }, { "name": "FAN1", "rpm": 4200, "status": "OK" },
{ "name": "FAN_CPU0", "location": "CPU0", "rpm": 5600, "status": "OK" } { "name": "FAN_CPU0", "rpm": 5600, "status": "OK" }
], ],
"power": [ "power": [
{ "name": "12V Rail", "location": "Mainboard", "voltage_v": 12.06, "status": "OK" }, { "name": "12V Rail", "voltage_v": 12.06, "status": "OK" },
{ "name": "PSU0 Input", "location": "PSU0", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" } { "name": "PSU0 Input", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
], ],
"temperatures": [ "temperatures": [
{ "name": "CPU0 Temp", "location": "CPU0", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" }, { "name": "CPU0 Temp", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
{ "name": "Inlet Temp", "location": "Front", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" } { "name": "Inlet Temp", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
], ],
"other": [ "other": [
{ "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" } { "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" }
@@ -655,6 +667,31 @@ PSU без `serial_number` игнорируется.
--- ---
## Секция platform_config
Необязательный объект с произвольными ключами — настройки платформы как есть из источника (BIOS, Redfish, IPMI).
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `platform_config` | object | нет | Произвольный объект: ключи — строки, значения — строки, числа, булевы |
**Правила platform_config:**
- Содержимое объекта не валидируется: передавайте параметры как есть.
- При каждом импорте хранится latest-snapshot per machine; история изменений по каждому ключу накапливается отдельно.
- Если секция отсутствует или равна `null` — данные платформы не обновляются.
```json
"platform_config": {
"SecureBoot": "Enabled",
"BiosVersion": "06.08.05",
"TpmEnabled": true,
"NumaEnabled": false,
"HyperThreading": "Enabled"
}
```
---
## Обработка статусов компонентов ## Обработка статусов компонентов
| Статус | Поведение | | Статус | Поведение |
@@ -787,6 +824,12 @@ PSU без `serial_number` игнорируется.
"other": [ "other": [
{ "name": "System Humidity", "value": 38.5, "unit": "%" } { "name": "System Humidity", "value": 38.5, "unit": "%" }
] ]
},
"platform_config": {
"SecureBoot": "Enabled",
"BiosVersion": "06.08.05",
"TpmEnabled": true,
"HyperThreading": "Enabled"
} }
} }
} }
+1
View File
@@ -867,6 +867,7 @@ needs_full_build() {
"${BUILDER_DIR}/config/package-lists" \ "${BUILDER_DIR}/config/package-lists" \
"${BUILDER_DIR}/config/hooks" \ "${BUILDER_DIR}/config/hooks" \
"${BUILDER_DIR}/config/archives" \ "${BUILDER_DIR}/config/archives" \
"${BUILDER_DIR}/config/bootloaders" \
-newer "${FULL_BUILD_MARKER}" 2>/dev/null | head -1) -newer "${FULL_BUILD_MARKER}" 2>/dev/null | head -1)
if [ -n "$_heavy" ]; then if [ -n "$_heavy" ]; then
@@ -31,6 +31,7 @@ systemctl enable bee-preflight.service
systemctl enable bee-audit.service systemctl enable bee-audit.service
systemctl enable bee-web.service systemctl enable bee-web.service
systemctl enable bee-sshsetup.service systemctl enable bee-sshsetup.service
systemctl enable bee-blackbox.service
systemctl enable bee-selfheal.timer systemctl enable bee-selfheal.timer
systemctl enable bee-boot-status.service systemctl enable bee-boot-status.service
systemctl enable ssh.service systemctl enable ssh.service
@@ -47,18 +47,27 @@ vim-tiny
mc mc
htop htop
nvtop nvtop
btop
sudo sudo
zstd zstd
mstflint mstflint
memtester memtester
stress-ng stress-ng
stressapptest stressapptest
fio
# QR codes (for displaying audit results) iperf3
qrencode iotop
nload
tcpdump
hdparm
sysstat
lsscsi
sg3-utils
jq
curl
net-tools
# Local desktop (openbox + chromium kiosk) # Local desktop (openbox + chromium kiosk)
gparted
openbox openbox
tint2 tint2
feh feh
Vendored Executable
BIN
View File
Binary file not shown.
Vendored Executable
BIN
View File
Binary file not shown.
Vendored Executable
BIN
View File
Binary file not shown.
Vendored Executable
BIN
View File
Binary file not shown.
Vendored Executable
BIN
View File
Binary file not shown.
-74
View File
@@ -1,74 +0,0 @@
#!/bin/sh
# fetch-vendor.sh — download proprietary vendor utilities into iso/vendor.
#
# Usage:
# STORCLI_URL=... STORCLI_SHA256=... \
# SAS2IRCU_URL=... SAS2IRCU_SHA256=... \
# SAS3IRCU_URL=... SAS3IRCU_SHA256=... \
# MSTFLINT_URL=... MSTFLINT_SHA256=... \
# sh scripts/fetch-vendor.sh
# Abort on the first failing command (-e) and on use of an unset variable (-u).
set -eu
# Repository root: the parent of the directory containing this script.
# CDPATH is cleared for the cd so it cannot redirect to, or echo, another path.
ROOT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
# All downloaded vendor utilities land here.
OUT_DIR="$ROOT_DIR/iso/vendor"
mkdir -p "$OUT_DIR"
# need_cmd CMD
# Exit the script with status 1 and an error on stderr unless CMD can be
# resolved by `command -v`.
need_cmd() {
    if ! command -v "$1" >/dev/null 2>&1; then
        echo "ERROR: required command not found: $1" >&2
        exit 1
    fi
}
# sha256sum is used to verify every download, so fail early if it is missing.
need_cmd sha256sum
# download_to URL OUT
# Download URL into the file OUT, using wget when available and curl
# otherwise. Returns the downloader's own exit status, so a failed transfer
# is reported to the caller even where errexit is suppressed (for example
# when this function is called from an `if` condition). Exits the script
# with status 1 if neither wget nor curl is installed.
download_to() {
    url="$1"
    out="$2"
    if command -v wget >/dev/null 2>&1; then
        wget -O "$out" "$url"
        return $?
    fi
    if command -v curl >/dev/null 2>&1; then
        curl -fsSL "$url" -o "$out"
        return $?
    fi
    echo "ERROR: required command not found: wget or curl" >&2
    exit 1
}
# fetch_one NAME URL SHA256
# Download one vendor utility to "$OUT_DIR/NAME" and verify its checksum.
#   NAME    file name to create inside $OUT_DIR
#   URL     download location; an empty value skips this tool
#   SHA256  expected hex digest (any letter case); an empty value skips this tool
# The file is downloaded to "NAME.tmp" and only renamed into place after the
# checksum matches. On a checksum mismatch the script exits with status 1.
fetch_one() {
    name="$1"
    url="$2"
    sha="$3"
    if [ -z "$url" ] || [ -z "$sha" ]; then
        echo "[vendor] skip $name (URL/SHA not provided)"
        return 0
    fi
    dst="$OUT_DIR/$name"
    tmp="$dst.tmp"
    echo "[vendor] downloading $name"
    # If the script terminates while the download is in flight (a failed
    # transfer aborts it under `set -e`), remove the partial temp file
    # instead of leaving it in the output directory.
    trap 'rm -f "$tmp"' EXIT
    download_to "$url" "$tmp"
    got=$(sha256sum "$tmp" | awk '{print $1}')
    # sha256sum prints lowercase hex; normalise the expected digest to match.
    want=$(echo "$sha" | tr '[:upper:]' '[:lower:]')
    if [ "$got" != "$want" ]; then
        rm -f "$tmp"
        echo "ERROR: checksum mismatch for $name" >&2
        echo " got: $got" >&2
        echo " want: $want" >&2
        exit 1
    fi
    mv "$tmp" "$dst"
    # The temp file no longer exists; drop the cleanup handler.
    trap - EXIT
    # Best effort: a failing chmod must not abort the run.
    chmod +x "$dst" || true
    echo "[vendor] ok: $name"
}
# Fetch each supported tool. The ${VAR:-} expansions substitute an empty
# string for unset variables so `set -u` does not abort; fetch_one skips any
# tool whose URL or SHA256 is empty.
fetch_one "storcli64" "${STORCLI_URL:-}" "${STORCLI_SHA256:-}"
fetch_one "sas2ircu" "${SAS2IRCU_URL:-}" "${SAS2IRCU_SHA256:-}"
fetch_one "sas3ircu" "${SAS3IRCU_URL:-}" "${SAS3IRCU_SHA256:-}"
fetch_one "mstflint" "${MSTFLINT_URL:-}" "${MSTFLINT_SHA256:-}"
echo "[vendor] done. output dir: $OUT_DIR"