Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
58d6da0e4f | ||
|
|
7ce73e34a4 | ||
|
|
8a21809ade | ||
|
|
626763e31d | ||
|
|
0b8a2ff83f | ||
|
|
2c22b01fe3 | ||
|
|
ec89616585 | ||
|
|
c0dbbf96ad | ||
|
|
76484b123c | ||
|
|
8901596152 | ||
|
|
7c504e5056 | ||
|
|
333c44f3ba | ||
|
|
3bca821d3e | ||
|
|
3648e37a1e | ||
|
|
d109e08fab | ||
|
|
11d00b9442 | ||
|
|
6defa5ae15 | ||
|
|
c76658ed00 | ||
|
|
2163017a98 | ||
| 29179917c3 |
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -67,6 +68,8 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||
return runSupportBundle(args[1:], stdout, stderr)
|
||||
case "web":
|
||||
return runWeb(args[1:], stdout, stderr)
|
||||
case "blackbox":
|
||||
return runBlackbox(args[1:], stdout, stderr)
|
||||
case "sat":
|
||||
return runSAT(args[1:], stdout, stderr)
|
||||
case "benchmark":
|
||||
@@ -90,6 +93,7 @@ func printRootUsage(w io.Writer) {
|
||||
bee export --target <device>
|
||||
bee support-bundle --output stdout|file:<path>
|
||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||
bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
|
||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
|
||||
@@ -109,6 +113,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||
case "web":
|
||||
return runWeb([]string{"--help"}, stdout, stdout)
|
||||
case "blackbox":
|
||||
return runBlackbox([]string{"--help"}, stdout, stdout)
|
||||
case "sat":
|
||||
return runSAT([]string{"--help"}, stdout, stderr)
|
||||
case "benchmark":
|
||||
@@ -340,6 +346,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBlackbox(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||
statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
|
||||
if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
|
||||
slog.Error("run blackbox", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
|
||||
779
audit/internal/app/blackbox.go
Normal file
779
audit/internal/app/blackbox.go
Normal file
@@ -0,0 +1,779 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
const (
|
||||
blackboxMarkerName = ".bee-blackbox"
|
||||
blackboxDiscoverInterval = 2 * time.Second
|
||||
blackboxMinFlushPeriod = 1 * time.Second
|
||||
blackboxMaxFlushPeriod = 30 * time.Second
|
||||
blackboxRecoveryFastCount = 5
|
||||
)
|
||||
|
||||
var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
|
||||
|
||||
var (
|
||||
blackboxExecCommand = exec.Command
|
||||
blackboxNow = func() time.Time { return time.Now().UTC() }
|
||||
)
|
||||
|
||||
type BlackboxMarker struct {
|
||||
Version int `json:"version"`
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
CreatedAtUTC string `json:"created_at_utc"`
|
||||
Host string `json:"host,omitempty"`
|
||||
}
|
||||
|
||||
type BlackboxTargetStatus struct {
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
Device string `json:"device"`
|
||||
FS platform.RemovableTarget `json:"fs"`
|
||||
BootFolder string `json:"boot_folder"`
|
||||
Status string `json:"status"`
|
||||
LastSyncAtUTC string `json:"last_sync_at_utc,omitempty"`
|
||||
LastCycleDuration string `json:"last_cycle_duration,omitempty"`
|
||||
FlushPeriod string `json:"flush_period"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
Mountpoint string `json:"mountpoint,omitempty"`
|
||||
}
|
||||
|
||||
type BlackboxState struct {
|
||||
Status string `json:"status"`
|
||||
BootStartedAtUTC string `json:"boot_started_at_utc"`
|
||||
BootFolder string `json:"boot_folder"`
|
||||
UpdatedAtUTC string `json:"updated_at_utc"`
|
||||
Targets []BlackboxTargetStatus `json:"targets"`
|
||||
}
|
||||
|
||||
type blackboxRuntime struct {
|
||||
exportDir string
|
||||
statePath string
|
||||
system *platform.System
|
||||
bootStarted time.Time
|
||||
bootFolder string
|
||||
|
||||
mu sync.Mutex
|
||||
workers map[string]*blackboxWorker
|
||||
}
|
||||
|
||||
type discoveredBlackboxTarget struct {
|
||||
marker BlackboxMarker
|
||||
target platform.RemovableTarget
|
||||
seenMount string
|
||||
mountedByBee bool
|
||||
}
|
||||
|
||||
type blackboxWorker struct {
|
||||
runtime *blackboxRuntime
|
||||
enrollmentID string
|
||||
|
||||
mu sync.Mutex
|
||||
target platform.RemovableTarget
|
||||
marker BlackboxMarker
|
||||
mountpoint string
|
||||
mountedByBee bool
|
||||
status string
|
||||
lastSyncAt time.Time
|
||||
lastDuration time.Duration
|
||||
flushPeriod time.Duration
|
||||
lastError string
|
||||
fastCycles int
|
||||
stopCh chan struct{}
|
||||
stoppedCh chan struct{}
|
||||
}
|
||||
|
||||
// RunBlackbox runs the blackbox daemon loop: every blackboxDiscoverInterval
// it re-scans removable targets for enrollment markers, reconciles the
// per-target worker set, and persists an aggregate state file at statePath.
// It blocks until ctx is cancelled, then stops all workers and returns
// ctx.Err(). Blank arguments fall back to package defaults.
func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
	exportDir = strings.TrimSpace(exportDir)
	if exportDir == "" {
		exportDir = DefaultExportDir
	}
	statePath = strings.TrimSpace(statePath)
	if statePath == "" {
		statePath = DefaultBlackboxStatePath
	}
	if system == nil {
		system = platform.New()
	}
	// Prefer the kernel boot time; fall back to "now" if /proc/stat is
	// unreadable so the boot folder name is still stable for this process.
	bootStarted, err := bootStartedAtUTC()
	if err != nil {
		bootStarted = blackboxNow()
	}
	rt := &blackboxRuntime{
		exportDir:   exportDir,
		statePath:   statePath,
		system:      system,
		bootStarted: bootStarted,
		bootFolder:  SupportBundleBaseName(bootStarted),
		workers:     make(map[string]*blackboxWorker),
	}
	// Best effort: failures here surface via state persistence below.
	_ = os.MkdirAll(filepath.Dir(statePath), 0755)
	rt.persistState()
	ticker := time.NewTicker(blackboxDiscoverInterval)
	defer ticker.Stop()
	for {
		rt.reconcile()
		select {
		case <-ctx.Done():
			rt.stopAll()
			return ctx.Err()
		case <-ticker.C:
		}
	}
}
|
||||
|
||||
func ReadBlackboxState(path string) (BlackboxState, error) {
|
||||
path = strings.TrimSpace(path)
|
||||
if path == "" {
|
||||
path = DefaultBlackboxStatePath
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return BlackboxState{}, err
|
||||
}
|
||||
var state BlackboxState
|
||||
if err := json.Unmarshal(raw, &state); err != nil {
|
||||
return BlackboxState{}, err
|
||||
}
|
||||
return state, nil
|
||||
}
|
||||
|
||||
// EnableBlackboxTarget writes (or refreshes) the enrollment marker file on
// the given removable target, mounting it temporarily when necessary, and
// returns the marker. An existing marker's enrollment ID is preserved;
// otherwise a fresh ID is generated.
func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
	target = sanitizeRemovableTarget(target)
	if target.Device == "" {
		return BlackboxMarker{}, fmt.Errorf("device is required")
	}
	mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
	if err != nil {
		return BlackboxMarker{}, err
	}
	// Only unmount what we mounted ourselves; pre-existing mounts are left alone.
	defer func() {
		if mountedByBee {
			_ = unmountTarget(mountpoint)
		}
	}()

	marker, _, err := readBlackboxMarker(mountpoint)
	// A missing marker is expected on first enrollment; any other read
	// failure aborts.
	if err != nil && !errors.Is(err, os.ErrNotExist) {
		return BlackboxMarker{}, err
	}
	if marker.EnrollmentID == "" {
		marker = BlackboxMarker{
			Version:      1,
			EnrollmentID: newBlackboxEnrollmentID(),
			CreatedAtUTC: blackboxNow().Format(time.RFC3339),
			Host:         hostnameOr("unknown"),
		}
	}
	// Rewrite unconditionally so the on-disk marker is refreshed even when
	// an existing enrollment ID is kept.
	if err := writeBlackboxMarker(mountpoint, marker); err != nil {
		return BlackboxMarker{}, err
	}
	return marker, nil
}
|
||||
|
||||
// DisableBlackboxTarget removes the enrollment marker from the removable
// target matching device and/or enrollmentID. Each removable target is
// probed (mounting temporarily when needed); targets that cannot be mounted
// are skipped. Returns os.ErrNotExist when nothing matched.
// Note: matching — even by device path — requires a readable marker file.
func DisableBlackboxTarget(device, enrollmentID string) error {
	device = strings.TrimSpace(device)
	enrollmentID = strings.TrimSpace(enrollmentID)
	if device == "" && enrollmentID == "" {
		return fmt.Errorf("device or enrollment_id is required")
	}
	system := platform.New()
	targets, err := system.ListRemovableTargets()
	if err != nil {
		return err
	}
	for _, target := range targets {
		target = sanitizeRemovableTarget(target)
		mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
		if mountErr != nil {
			continue
		}
		remove := false
		marker, _, err := readBlackboxMarker(mountpoint)
		if err == nil {
			if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
				remove = true
			}
			if device != "" && target.Device == device {
				remove = true
			}
		}
		if remove {
			err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
		}
		// Always undo our own probe mount before reporting the result.
		if mountedByBee {
			_ = unmountTarget(mountpoint)
		}
		// On a match, return the marker-removal result (nil on success).
		if remove {
			return err
		}
	}
	return os.ErrNotExist
}
|
||||
|
||||
func (rt *blackboxRuntime) reconcile() {
|
||||
discovered, _ := rt.discoverMarkedTargets()
|
||||
|
||||
rt.mu.Lock()
|
||||
defer rt.mu.Unlock()
|
||||
|
||||
seen := make(map[string]struct{}, len(discovered))
|
||||
for _, found := range discovered {
|
||||
seen[found.marker.EnrollmentID] = struct{}{}
|
||||
worker, ok := rt.workers[found.marker.EnrollmentID]
|
||||
if !ok {
|
||||
worker = newBlackboxWorker(rt, found)
|
||||
rt.workers[found.marker.EnrollmentID] = worker
|
||||
go worker.run()
|
||||
continue
|
||||
}
|
||||
worker.update(found)
|
||||
}
|
||||
for id, worker := range rt.workers {
|
||||
if _, ok := seen[id]; ok {
|
||||
continue
|
||||
}
|
||||
worker.stop()
|
||||
delete(rt.workers, id)
|
||||
}
|
||||
rt.persistStateLocked()
|
||||
}
|
||||
|
||||
// stopAll detaches every worker from the runtime map, persists the emptied
// state, and then waits for each worker goroutine to exit. The stop() calls
// happen after rt.mu is released because stop() blocks on the worker
// goroutine, which may itself need rt.mu to persist its final cycle state.
func (rt *blackboxRuntime) stopAll() {
	rt.mu.Lock()
	workers := make([]*blackboxWorker, 0, len(rt.workers))
	for _, worker := range rt.workers {
		workers = append(workers, worker)
	}
	rt.workers = map[string]*blackboxWorker{}
	rt.persistStateLocked()
	rt.mu.Unlock()
	for _, worker := range workers {
		worker.stop()
	}
}
|
||||
|
||||
// discoverMarkedTargets probes every removable target for an enrollment
// marker and returns the enrolled ones, sorted by enrollment ID for
// deterministic reconciliation. Probe mounts are transient: anything this
// function mounted is unmounted again before returning; workers re-mount
// during their own sync cycles.
func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
	targets, err := rt.system.ListRemovableTargets()
	if err != nil {
		return nil, err
	}
	var out []discoveredBlackboxTarget
	for _, rawTarget := range targets {
		target := sanitizeRemovableTarget(rawTarget)
		if target.Device == "" {
			continue
		}
		// Skip targets that cannot be mounted; they may become usable later.
		mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
		if err != nil {
			continue
		}
		marker, ok, err := readBlackboxMarker(mountpoint)
		// No marker found: release the probe mount before skipping.
		if mountedByBee && !ok {
			_ = unmountTarget(mountpoint)
		}
		if err != nil || !ok || marker.EnrollmentID == "" {
			continue
		}
		// Marker found: still release the probe mount — the worker mounts
		// the target itself on each sync cycle.
		if mountedByBee {
			_ = unmountTarget(mountpoint)
		}
		out = append(out, discoveredBlackboxTarget{
			marker:       marker,
			target:       target,
			seenMount:    mountpoint,
			mountedByBee: mountedByBee,
		})
	}
	sort.Slice(out, func(i, j int) bool {
		return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
	})
	return out, nil
}
|
||||
|
||||
func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
|
||||
return &blackboxWorker{
|
||||
runtime: rt,
|
||||
enrollmentID: found.marker.EnrollmentID,
|
||||
target: found.target,
|
||||
marker: found.marker,
|
||||
flushPeriod: blackboxMinFlushPeriod,
|
||||
status: "running",
|
||||
stopCh: make(chan struct{}),
|
||||
stoppedCh: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// run is the worker goroutine body: perform a sync cycle, record its outcome
// (which adapts the flush period), then wait out the flush period or a stop
// request, repeating until stopped. Closes stoppedCh on exit so stop() can
// block on it.
func (w *blackboxWorker) run() {
	defer close(w.stoppedCh)
	for {
		start := time.Now()
		err := w.syncCycle()
		duration := time.Since(start)
		w.finishCycle(duration, err)

		wait := w.currentFlushPeriod()
		timer := time.NewTimer(wait)
		select {
		case <-w.stopCh:
			timer.Stop()
			// Unmount anything this worker mounted before exiting.
			w.cleanup()
			return
		case <-timer.C:
		}
	}
}
|
||||
|
||||
func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.target = found.target
|
||||
w.marker = found.marker
|
||||
}
|
||||
|
||||
// stop requests shutdown by closing stopCh (skipped if already closed) and
// blocks until the run goroutine signals completion via stoppedCh.
// NOTE(review): the select/close pair is not atomic, so two *concurrent*
// stop() calls could both reach close and panic; current callers
// (reconcile, stopAll) run on the single RunBlackbox loop goroutine —
// confirm before calling stop from elsewhere.
func (w *blackboxWorker) stop() {
	select {
	case <-w.stopCh:
	default:
		close(w.stopCh)
	}
	<-w.stoppedCh
}
|
||||
|
||||
func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.flushPeriod
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.lastDuration = duration
|
||||
if err != nil {
|
||||
w.status = "degraded"
|
||||
w.lastError = err.Error()
|
||||
w.fastCycles = 0
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
|
||||
} else {
|
||||
w.status = "running"
|
||||
w.lastSyncAt = blackboxNow()
|
||||
w.lastError = ""
|
||||
if duration <= w.flushPeriod/2 {
|
||||
w.fastCycles++
|
||||
} else {
|
||||
w.fastCycles = 0
|
||||
}
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||
}
|
||||
w.runtime.persistState()
|
||||
}
|
||||
|
||||
func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
|
||||
if current <= 0 {
|
||||
current = blackboxMinFlushPeriod
|
||||
}
|
||||
if duration <= 0 {
|
||||
duration = current
|
||||
}
|
||||
next := current
|
||||
if duration > current {
|
||||
growA := time.Duration(float64(current) * 1.25)
|
||||
growB := time.Duration(float64(duration) * 1.25)
|
||||
if growB > growA {
|
||||
next = growB
|
||||
} else {
|
||||
next = growA
|
||||
}
|
||||
}
|
||||
if success && fastCycles >= blackboxRecoveryFastCount {
|
||||
next = time.Duration(float64(current) * 0.9)
|
||||
}
|
||||
if next < blackboxMinFlushPeriod {
|
||||
next = blackboxMinFlushPeriod
|
||||
}
|
||||
if next > blackboxMaxFlushPeriod {
|
||||
next = blackboxMaxFlushPeriod
|
||||
}
|
||||
return next
|
||||
}
|
||||
|
||||
// syncCycle performs one replication pass: mount the target (if needed),
// mirror the export directory into <mount>/<bootFolder>/export, refresh the
// journald/system snapshots, and flush the filesystem so the data survives
// abrupt power loss or media removal. The mount is left in place between
// cycles and torn down by cleanup() on stop.
func (w *blackboxWorker) syncCycle() error {
	target, marker := w.snapshotTarget()
	mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
	if err != nil {
		return err
	}
	// Remember the mount so cleanup() can unmount it on shutdown.
	w.recordMountpoint(mountpoint, mountedByBee)

	root := filepath.Join(mountpoint, w.runtime.bootFolder)
	if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
		return err
	}
	if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
		return err
	}
	if err := w.captureSnapshots(root); err != nil {
		return err
	}
	return syncFilesystem(root)
}
|
||||
|
||||
func (w *blackboxWorker) cleanup() {
|
||||
w.mu.Lock()
|
||||
mountpoint := w.mountpoint
|
||||
mountedByBee := w.mountedByBee
|
||||
w.mu.Unlock()
|
||||
if mountedByBee && mountpoint != "" {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.target, w.marker
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.mountpoint = mountpoint
|
||||
w.mountedByBee = mountedByBee
|
||||
}
|
||||
|
||||
// captureSnapshots refreshes diagnostic snapshots under root: the combined
// journal since boot, per-service journals and unit status for every
// support-bundle service, dmesg output, and the optional files the support
// bundle also collects. All writes go through atomic, skip-if-unchanged
// helpers to minimize rewrites on removable media.
func (w *blackboxWorker) captureSnapshots(root string) error {
	if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
		return err
	}
	for _, svc := range supportBundleServices {
		if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
			return err
		}
		if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
			return err
		}
	}
	if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
		return err
	}
	for _, item := range supportBundleOptionalFiles {
		// Optional files may legitimately be absent; only real copy
		// failures abort the cycle.
		if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
			return err
		}
	}
	return nil
}
|
||||
|
||||
func (rt *blackboxRuntime) persistState() {
|
||||
rt.mu.Lock()
|
||||
defer rt.mu.Unlock()
|
||||
rt.persistStateLocked()
|
||||
}
|
||||
|
||||
// persistStateLocked snapshots every worker into a BlackboxState and writes
// it atomically to rt.statePath. The caller must hold rt.mu. Overall status
// is "disabled" with no workers, "running" otherwise, and "degraded" when
// any worker reported a failing cycle.
func (rt *blackboxRuntime) persistStateLocked() {
	state := BlackboxState{
		Status:           "disabled",
		BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
		BootFolder:       rt.bootFolder,
		UpdatedAtUTC:     blackboxNow().Format(time.RFC3339),
		Targets:          make([]BlackboxTargetStatus, 0, len(rt.workers)),
	}
	if len(rt.workers) > 0 {
		state.Status = "running"
	}
	for _, worker := range rt.workers {
		// Lock each worker briefly to take a consistent field snapshot.
		worker.mu.Lock()
		targetState := BlackboxTargetStatus{
			EnrollmentID: worker.enrollmentID,
			Device:       worker.target.Device,
			FS:           worker.target,
			BootFolder:   rt.bootFolder,
			Status:       worker.status,
			FlushPeriod:  worker.flushPeriod.String(),
			LastError:    worker.lastError,
			Mountpoint:   worker.mountpoint,
		}
		if !worker.lastSyncAt.IsZero() {
			targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
		}
		if worker.lastDuration > 0 {
			targetState.LastCycleDuration = worker.lastDuration.String()
		}
		// Any degraded worker degrades the aggregate status.
		if worker.status == "degraded" {
			state.Status = "degraded"
		}
		worker.mu.Unlock()
		state.Targets = append(state.Targets, targetState)
	}
	// Deterministic ordering keeps state-file diffs stable.
	sort.Slice(state.Targets, func(i, j int) bool {
		return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
	})
	// Best effort: state persistence must not crash the daemon.
	_ = writeJSONAtomic(rt.statePath, state)
}
|
||||
|
||||
// bootStartedAtUTC returns the kernel boot time in UTC, read from the
// "btime" line of /proc/stat (seconds since the Unix epoch).
func bootStartedAtUTC() (time.Time, error) {
	raw, err := os.ReadFile("/proc/stat")
	if err != nil {
		return time.Time{}, err
	}
	return parseBootTime(string(raw))
}

// parseBootTime extracts the "btime <epoch-seconds>" entry from /proc/stat
// content. Split out of bootStartedAtUTC so the parsing is testable without
// reading the real /proc filesystem.
func parseBootTime(raw string) (time.Time, error) {
	for _, line := range strings.Split(raw, "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "btime ") {
			continue
		}
		parts := strings.Fields(line)
		if len(parts) != 2 {
			break
		}
		// The value is an integer second count; parse it as a "<n>s"
		// duration to reuse the stdlib's numeric validation.
		sec, err := time.ParseDuration(parts[1] + "s")
		if err != nil {
			break
		}
		return time.Unix(int64(sec/time.Second), 0).UTC(), nil
	}
	return time.Time{}, fmt.Errorf("boot time not found")
}
|
||||
|
||||
// newBlackboxEnrollmentID returns a "bb-"-prefixed identifier built from 8
// random bytes hex-encoded, or a timestamp-based fallback if crypto/rand
// fails.
func newBlackboxEnrollmentID() string {
	buf := make([]byte, 8)
	if _, err := rand.Read(buf); err != nil {
		return fmt.Sprintf("bb-%d", time.Now().UnixNano())
	}
	return "bb-" + hex.EncodeToString(buf)
}
|
||||
|
||||
func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
|
||||
target.Device = strings.TrimSpace(target.Device)
|
||||
target.FSType = strings.TrimSpace(target.FSType)
|
||||
target.Size = strings.TrimSpace(target.Size)
|
||||
target.Label = strings.TrimSpace(target.Label)
|
||||
target.Model = strings.TrimSpace(target.Model)
|
||||
target.Mountpoint = strings.TrimSpace(target.Mountpoint)
|
||||
return target
|
||||
}
|
||||
|
||||
// ensureMountedTarget returns a writable mountpoint for target. An existing
// writable mount is reused (mountedByBee=false); otherwise the device is
// mounted under /tmp/bee-blackbox-<suffix> (mountedByBee=true) and the
// caller is responsible for unmounting it.
func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
	target = sanitizeRemovableTarget(target)
	// Prefer the OS's existing mount when it accepts writes.
	if target.Mountpoint != "" {
		if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
			return target.Mountpoint, false, nil
		}
	}
	mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
	if err := os.MkdirAll(mountpoint, 0755); err != nil {
		return "", false, err
	}
	if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
		return "", false, formatBlackboxMountTargetError(target, string(raw), err)
	}
	// A read-only mount is useless for the blackbox; undo it and fail.
	if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
		_ = unmountTarget(mountpoint)
		return "", false, err
	}
	return mountpoint, true, nil
}
|
||||
|
||||
func unmountTarget(mountpoint string) error {
|
||||
_ = blackboxExecCommand("sync").Run()
|
||||
raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(raw))
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
|
||||
raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return BlackboxMarker{}, false, os.ErrNotExist
|
||||
}
|
||||
return BlackboxMarker{}, false, err
|
||||
}
|
||||
var marker BlackboxMarker
|
||||
if err := json.Unmarshal(raw, &marker); err != nil {
|
||||
return BlackboxMarker{}, false, err
|
||||
}
|
||||
return marker, true, nil
|
||||
}
|
||||
|
||||
func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
|
||||
if marker.Version == 0 {
|
||||
marker.Version = 1
|
||||
}
|
||||
return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
|
||||
}
|
||||
|
||||
// syncDirectoryTree mirrors srcDir into dstDir: directories are created
// (preserving source permission bits), files are copied when their content
// differs, and anything in dstDir no longer present in srcDir is removed
// afterwards.
func syncDirectoryTree(srcDir, dstDir string) error {
	// Track every srcDir-relative path so extraneous dst entries can be
	// pruned at the end.
	seen := make(map[string]struct{})
	err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		rel, err := filepath.Rel(srcDir, path)
		if err != nil {
			return err
		}
		rel = filepath.Clean(rel)
		if rel == "." {
			seen["."] = struct{}{}
			return os.MkdirAll(dstDir, 0755)
		}
		seen[rel] = struct{}{}
		dstPath := filepath.Join(dstDir, rel)
		if d.IsDir() {
			info, err := d.Info()
			if err != nil {
				return err
			}
			return os.MkdirAll(dstPath, info.Mode().Perm())
		}
		return copyFileIfChanged(path, dstPath)
	})
	if err != nil {
		return err
	}
	return removeMissingPaths(dstDir, seen)
}
|
||||
|
||||
func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
|
||||
return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel, err := filepath.Rel(dstDir, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel = filepath.Clean(rel)
|
||||
if rel == "." {
|
||||
return nil
|
||||
}
|
||||
if _, ok := seen[rel]; ok {
|
||||
return nil
|
||||
}
|
||||
return os.RemoveAll(path)
|
||||
})
|
||||
}
|
||||
|
||||
// copyFileIfChanged copies src to dst atomically, but only when dst is
// missing or its bytes differ; directories are created rather than copied.
// Contents are compared fully in memory — assumes files are small enough to
// read whole; TODO confirm for large exports.
func copyFileIfChanged(src, dst string) error {
	info, err := os.Stat(src)
	if err != nil {
		return err
	}
	if info.IsDir() {
		return os.MkdirAll(dst, info.Mode().Perm())
	}
	srcData, err := os.ReadFile(src)
	if err != nil {
		return err
	}
	// Unchanged content: skip the write to spare the removable medium.
	if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
		return nil
	}
	return writeFileAtomic(dst, srcData, info.Mode().Perm())
}
|
||||
|
||||
// captureCommandAtomic runs a command and writes its combined output to dst
// atomically. The output is never empty: the command's error text or a
// "no output" placeholder is substituted, so the snapshot file exists even
// when the tool is unavailable. The command's own exit error is deliberately
// not propagated — only write failures are.
func captureCommandAtomic(dst string, name string, args ...string) error {
	raw, err := blackboxExecCommand(name, args...).CombinedOutput()
	if len(raw) == 0 {
		if err != nil {
			raw = []byte(err.Error() + "\n")
		} else {
			raw = []byte("no output\n")
		}
	}
	return writeFileAtomic(dst, raw, 0644)
}
|
||||
|
||||
func writeJSONAtomic(path string, v any) error {
|
||||
raw, err := json.MarshalIndent(v, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
raw = append(raw, '\n')
|
||||
return writeFileAtomic(path, raw, 0644)
|
||||
}
|
||||
|
||||
func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
|
||||
return nil
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := f.Write(data); err != nil {
|
||||
_ = f.Close()
|
||||
return err
|
||||
}
|
||||
if err := f.Sync(); err != nil {
|
||||
_ = f.Close()
|
||||
return err
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
return err
|
||||
}
|
||||
return syncFilesystem(filepath.Dir(path))
|
||||
}
|
||||
|
||||
func syncFilesystem(path string) error {
|
||||
return blackboxExecCommand("sync").Run()
|
||||
}
|
||||
|
||||
// ensureWritableBlackboxMountpoint verifies the mounted filesystem accepts
// writes by creating and then removing a small probe file.
func ensureWritableBlackboxMountpoint(mountpoint string) error {
	probe, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
	if err != nil {
		return fmt.Errorf("target filesystem is not writable: %w", err)
	}
	name := probe.Name()
	if closeErr := probe.Close(); closeErr != nil {
		// Best-effort removal; report the close failure.
		_ = os.Remove(name)
		return closeErr
	}
	return os.Remove(name)
}
|
||||
|
||||
func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
|
||||
msg := strings.TrimSpace(raw)
|
||||
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||
}
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
52
audit/internal/app/blackbox_test.go
Normal file
52
audit/internal/app/blackbox_test.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// A cycle that takes longer than the current flush period must grow the
// period (backoff on a struggling target).
func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
	current := 2 * time.Second
	got := adjustFlushPeriod(current, 4*time.Second, false, 0)
	if got <= current {
		t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
	}
}
|
||||
|
||||
// After blackboxRecoveryFastCount consecutive fast successful cycles the
// period must shrink (recovery), without dropping below the minimum.
func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
	current := 10 * time.Second
	got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
	if got >= current {
		t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
	}
	if got < blackboxMinFlushPeriod {
		t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
	}
}
|
||||
|
||||
// Round trip: a state written by writeJSONAtomic must be readable by
// ReadBlackboxState with its key fields intact.
func TestReadBlackboxState(t *testing.T) {
	path := filepath.Join(t.TempDir(), "blackbox-state.json")
	want := BlackboxState{
		Status:           "running",
		BootStartedAtUTC: "2026-04-24T00:00:00Z",
		BootFolder:       "boot-folder",
		UpdatedAtUTC:     "2026-04-24T00:00:01Z",
		Targets: []BlackboxTargetStatus{{
			EnrollmentID: "bb-1",
			Device:       "/dev/sdb1",
			Status:       "running",
			FlushPeriod:  "1s",
		}},
	}
	if err := writeJSONAtomic(path, want); err != nil {
		t.Fatalf("writeJSONAtomic: %v", err)
	}
	got, err := ReadBlackboxState(path)
	if err != nil {
		t.Fatalf("ReadBlackboxState: %v", err)
	}
	// Spot-check the fields the daemon and UI rely on.
	if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
		t.Fatalf("state=%+v", got)
	}
}
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
)
|
||||
|
||||
var supportBundleServices = []string{
|
||||
"bee-blackbox.service",
|
||||
"bee-audit.service",
|
||||
"bee-web.service",
|
||||
"bee-network.service",
|
||||
@@ -256,11 +257,6 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
@@ -294,7 +290,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||
archiveName := SupportBundleBaseName(now) + ".tar.gz"
|
||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
@@ -302,6 +298,16 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
func SupportBundleBaseName(at time.Time) string {
|
||||
at = at.UTC()
|
||||
date := at.Format("2006-01-02")
|
||||
tod := at.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
|
||||
}
|
||||
|
||||
func LatestSupportBundlePath() (string, error) {
|
||||
return latestSupportBundlePath(os.TempDir())
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package collector
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
var execIpmitool = func(args ...string) (string, error) {
|
||||
out, err := exec.Command("ipmitool", args...).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
||||
|
||||
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
||||
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
||||
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
|
||||
func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
|
||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
||||
return nil
|
||||
}
|
||||
out, err := execIpmitool("mc", "info")
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
|
||||
raw, err := cmd.Output()
|
||||
if err != nil {
|
||||
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
version := parseBMCFirmwareRevision(out)
|
||||
version := parseBMCFirmwareRevision(string(raw))
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
board, biosFW := collectBoard()
|
||||
snap.Board = board
|
||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
|
||||
|
||||
snap.CPUs = collectCPUs()
|
||||
|
||||
@@ -34,6 +34,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
}
|
||||
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
bestEffortRescanHotplugStorage()
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
@@ -44,7 +45,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||
snap.PowerSupplies = collectPSUs()
|
||||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
92
audit/internal/collector/ipmi_profile.go
Normal file
92
audit/internal/collector/ipmi_profile.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package collector
|
||||
|
||||
// Package-level IPMI tuning profiles.
|
||||
//
|
||||
// Each profile is matched by board manufacturer (already known before PSU
|
||||
// collection runs). The profile drives two things:
|
||||
// - Per-command timeouts — prevents infinite hangs on slow BMCs.
|
||||
// - FRU early-exit — streaming parser stops reading once all PSU entries
|
||||
// are found, avoiding the tail of non-PSU FRU records.
|
||||
//
|
||||
// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ipmiProfile holds tuning parameters for one or more board manufacturers.
|
||||
// ipmiProfile holds per-vendor tuning parameters for ipmitool invocations.
type ipmiProfile struct {
	// name is shown in log messages.
	name string
	// manufacturers is a list of lowercase substrings matched against the
	// board manufacturer string from dmidecode type 1.
	manufacturers []string
	// fruTimeout is the hard deadline for the entire `ipmitool fru print`
	// command. Zero means no timeout (not recommended).
	fruTimeout time.Duration
	// sdrTimeout is the hard deadline for `ipmitool sdr`.
	sdrTimeout time.Duration
	// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
	mcInfoTimeout time.Duration
	// fruEarlyExit instructs the streaming FRU parser to stop reading
	// after it has found at least one PSU entry and the current block is
	// complete. Useful on servers with many non-PSU FRU devices.
	fruEarlyExit bool
}

// ipmiProfiles is the ordered list of vendor profiles. First match wins.
var ipmiProfiles = []ipmiProfile{
	{
		// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
		// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
		// full `fru print` scan takes ~108 s on a loaded BMC. Enable early
		// exit so collection stops once PSU records are found.
		name:          "lenovo",
		manufacturers: []string{"lenovo"},
		fruTimeout:    90 * time.Second,
		sdrTimeout:    45 * time.Second,
		mcInfoTimeout: 15 * time.Second,
		fruEarlyExit:  true,
	},
	{
		// HPE iLO-based servers (ProLiant DL/ML/BL). "hewlett-packard"
		// covers hyphenated strings such as "Hewlett-Packard Enterprise",
		// which neither "hp" (the letters are not adjacent) nor
		// "hewlett packard" (space, not hyphen) would match.
		name:          "hpe",
		manufacturers: []string{"hp", "hewlett packard", "hewlett-packard"},
		fruTimeout:    60 * time.Second,
		sdrTimeout:    30 * time.Second,
		mcInfoTimeout: 10 * time.Second,
		fruEarlyExit:  false,
	},
	{
		// Dell iDRAC-based servers.
		name:          "dell",
		manufacturers: []string{"dell"},
		fruTimeout:    60 * time.Second,
		sdrTimeout:    30 * time.Second,
		mcInfoTimeout: 10 * time.Second,
		fruEarlyExit:  false,
	},
}

// defaultIPMIProfile is used when no vendor profile matches.
var defaultIPMIProfile = ipmiProfile{
	name:          "default",
	fruTimeout:    60 * time.Second,
	sdrTimeout:    30 * time.Second,
	mcInfoTimeout: 10 * time.Second,
	fruEarlyExit:  false,
}

// selectIPMIProfile returns the tuning profile for the given board
// manufacturer. Matching is case-insensitive substring matching against
// each profile's manufacturers list, in declaration order; when nothing
// matches, defaultIPMIProfile is returned.
func selectIPMIProfile(manufacturer string) ipmiProfile {
	mfgLower := strings.ToLower(strings.TrimSpace(manufacturer))
	for _, p := range ipmiProfiles {
		for _, m := range p.manufacturers {
			if strings.Contains(mfgLower, m) {
				return p
			}
		}
	}
	return defaultIPMIProfile
}
|
||||
@@ -4,7 +4,9 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@@ -140,6 +142,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if group, ok := readPCIIOMMUGroup(bdf); ok {
|
||||
dev.IOMMUGroup = &group
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
}
|
||||
@@ -179,6 +184,21 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
return dev
|
||||
}
|
||||
|
||||
// readPCIIOMMUGroup resolves the IOMMU group number for a BDF via the
|
||||
// iommu_group symlink in sysfs: .../devices/<bdf>/iommu_group -> .../kernel/iommu_groups/<N>
|
||||
// readPCIIOMMUGroup resolves the IOMMU group number for a PCI device by
// following the sysfs symlink:
// /sys/bus/pci/devices/<bdf>/iommu_group -> .../kernel/iommu_groups/<N>.
// The second return value is false when the link is absent or its target's
// basename is not an integer.
func readPCIIOMMUGroup(bdf string) (int, bool) {
	target, err := os.Readlink("/sys/bus/pci/devices/" + bdf + "/iommu_group")
	if err != nil {
		return 0, false
	}
	group, err := strconv.Atoi(filepath.Base(target))
	if err != nil {
		return 0, false
	}
	return group, true
}
|
||||
|
||||
// readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
|
||||
func readPCIIDs(bdf string) (vendorID, deviceID int) {
|
||||
base := "/sys/bus/pci/devices/" + bdf
|
||||
|
||||
@@ -2,6 +2,8 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
@@ -10,16 +12,29 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectPSUs() []schema.HardwarePowerSupply {
|
||||
func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
|
||||
defer fruCancel()
|
||||
|
||||
if profile.fruEarlyExit {
|
||||
psus = collectFRUEarlyExit(fruCtx)
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
|
||||
if out, err := cmd.Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
sdrData := map[int]psuSDR{}
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
|
||||
defer sdrCancel()
|
||||
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
|
||||
if sdrOut, err := cmd.Output(); err == nil {
|
||||
sdrData = parsePSUSDR(string(sdrOut))
|
||||
if len(psus) == 0 {
|
||||
psus = synthesizePSUsFromSDR(sdrData)
|
||||
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
|
||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||
return nil
|
||||
}
|
||||
slog.Info("psu: collected", "count", len(psus))
|
||||
slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
|
||||
return psus
|
||||
}
|
||||
|
||||
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
|
||||
// as soon as it has found all PSU blocks and the next block is not a PSU.
|
||||
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
|
||||
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
|
||||
pipe, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
slog.Info("psu: fru pipe unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
slog.Info("psu: fru start failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
var currentBlock strings.Builder
|
||||
slot := 0
|
||||
psuFound := false
|
||||
stoppedEarly := false
|
||||
|
||||
scanner := bufio.NewScanner(pipe)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
if strings.HasPrefix(line, "FRU Device Description") {
|
||||
if currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
psuFound = true
|
||||
slot++
|
||||
}
|
||||
currentBlock.Reset()
|
||||
}
|
||||
// Stop once we've collected PSUs and hit a non-PSU block header.
|
||||
if psuFound && !isPSUHeader(strings.ToLower(line)) {
|
||||
stoppedEarly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
currentBlock.WriteString(line)
|
||||
currentBlock.WriteByte('\n')
|
||||
}
|
||||
|
||||
if !stoppedEarly && currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
}
|
||||
}
|
||||
|
||||
// Kill the process immediately on early exit rather than waiting for context timeout.
|
||||
if cmd.Process != nil {
|
||||
cmd.Process.Kill() //nolint:errcheck
|
||||
}
|
||||
cmd.Wait() //nolint:errcheck
|
||||
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
|
||||
return psus
|
||||
}
|
||||
|
||||
|
||||
@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
|
||||
return arrays
|
||||
}
|
||||
|
||||
// collectVROCLicense runs mdadm --detail-platform and extracts the License field.
|
||||
// Returns nil when VROC is absent or the platform does not report a license.
|
||||
func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
|
||||
if !hasVROCController(pcie) {
|
||||
return nil
|
||||
}
|
||||
out, err := raidToolQuery("mdadm", "--detail-platform")
|
||||
if err != nil {
|
||||
slog.Info("vroc: mdadm --detail-platform unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
return parseMDAdmPlatformLicense(string(out))
|
||||
}
|
||||
|
||||
// parseMDAdmPlatformLicense scans `mdadm --detail-platform` output for a
// "License : <value>" line (case-insensitive prefix match) and returns the
// lowercased value. Returns nil when no such line exists or the value after
// the colon is empty.
func parseMDAdmPlatformLicense(raw string) *string {
	for _, line := range strings.Split(raw, "\n") {
		trimmed := strings.TrimSpace(line)
		if !strings.HasPrefix(strings.ToLower(trimmed), "license") {
			continue
		}
		_, rest, found := strings.Cut(trimmed, ":")
		if !found {
			continue
		}
		if value := strings.TrimSpace(rest); value != "" {
			lower := strings.ToLower(value)
			return &lower
		}
	}
	return nil
}
|
||||
|
||||
func queryDeviceSerial(devPath string) string {
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
var ctrl nvmeIDCtrl
|
||||
|
||||
@@ -4,12 +4,52 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
pciRescanPath = "/sys/bus/pci/rescan"
|
||||
scsiHostScanGlob = "/sys/class/scsi_host/host*/scan"
|
||||
hotplugWriteFile = os.WriteFile
|
||||
hotplugExecCommand = exec.Command
|
||||
hotplugGlob = filepath.Glob
|
||||
nvmeLBAFCompactRE = regexp.MustCompile(`(?im)^\s*lbaf\s+\d+\s*:\s*ms:(\d+)\s+lbads:(\d+).*?\(in use\)\s*$`)
|
||||
nvmeLBAFVerboseRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+\d+\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*?\(in use\)\s*$`)
|
||||
sgReadcapBlockRE = regexp.MustCompile(`(?im)logical block length\s*=\s*(\d+)\s+bytes`)
|
||||
sgReadcapProtRE = regexp.MustCompile(`(?im)prot_en\s*=\s*1`)
|
||||
)
|
||||
|
||||
func bestEffortRescanHotplugStorage() {
|
||||
if err := hotplugWriteFile(pciRescanPath, []byte("1\n"), 0644); err != nil {
|
||||
slog.Info("storage: pci rescan skipped", "path", pciRescanPath, "err", err)
|
||||
} else {
|
||||
slog.Info("storage: triggered pci rescan for hotplug discovery")
|
||||
}
|
||||
|
||||
hostPaths, err := hotplugGlob(scsiHostScanGlob)
|
||||
if err != nil {
|
||||
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
||||
} else {
|
||||
for _, path := range hostPaths {
|
||||
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
||||
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
||||
continue
|
||||
}
|
||||
slog.Info("storage: triggered scsi host scan", "path", path)
|
||||
}
|
||||
}
|
||||
|
||||
out, err := hotplugExecCommand("udevadm", "settle", "--timeout=10").CombinedOutput()
|
||||
if err != nil {
|
||||
slog.Info("storage: udev settle after hotplug rescan failed", "err", err, "output", strings.TrimSpace(string(out)))
|
||||
}
|
||||
}
|
||||
|
||||
func collectStorage() []schema.HardwareStorage {
|
||||
devs := discoverStorageDevices()
|
||||
result := make([]schema.HardwareStorage, 0, len(devs))
|
||||
@@ -35,6 +75,8 @@ type lsblkDevice struct {
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
LogSec string `json:"log-sec"`
|
||||
PhySec string `json:"phy-sec"`
|
||||
}
|
||||
|
||||
type lsblkRoot struct {
|
||||
@@ -101,7 +143,7 @@ func isVirtualHDiskModel(model string) bool {
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL,LOG-SEC,PHY-SEC").Output()
|
||||
if err != nil {
|
||||
slog.Warn("storage: lsblk failed", "err", err)
|
||||
return nil
|
||||
@@ -208,6 +250,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
s := schema.HardwareStorage{Present: &present}
|
||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||
applyStorageBlockGeometry(&s, dev)
|
||||
|
||||
tran := strings.ToLower(dev.Tran)
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -250,6 +293,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
|
||||
var info smartctlInfo
|
||||
var raw map[string]any
|
||||
_ = json.Unmarshal(out, &raw)
|
||||
if err := json.Unmarshal(out, &info); err == nil {
|
||||
if v := cleanDMIValue(info.ModelName); v != "" {
|
||||
s.Model = &v
|
||||
@@ -302,8 +347,11 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
value := float64(attr.Raw.Value)
|
||||
s.LifeRemainingPct = &value
|
||||
case 241:
|
||||
value := attr.Raw.Value
|
||||
value := smartLBAsToBytes(attr.Raw.Value)
|
||||
s.WrittenBytes = &value
|
||||
case 242:
|
||||
value := smartLBAsToBytes(attr.Raw.Value)
|
||||
s.ReadBytes = &value
|
||||
case 197:
|
||||
pending = attr.Raw.Value
|
||||
s.CurrentPendingSectors = &pending
|
||||
@@ -321,6 +369,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
offlineUncorrectable: uncorrectable,
|
||||
lifeRemainingPct: lifeRemaining,
|
||||
}
|
||||
applySCSISmartctlTelemetry(&s, raw, &status)
|
||||
applySCSIProtectionBlockGeometry(&s, devPath)
|
||||
setStorageHealthStatus(&s, status)
|
||||
return s
|
||||
}
|
||||
@@ -368,6 +418,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
Interface: &iface,
|
||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||
}
|
||||
applyStorageBlockGeometry(&s, dev)
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
||||
@@ -402,6 +453,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
}
|
||||
}
|
||||
applyNVMeBlockGeometry(&s, devPath)
|
||||
|
||||
// smart-log: wear telemetry
|
||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||
@@ -477,6 +529,251 @@ func nvmeDataUnitsToBytes(units int64) int64 {
|
||||
return units * 512000
|
||||
}
|
||||
|
||||
// smartLBAsToBytes converts a SMART raw LBA count (attributes 241/242) into
// bytes, using the conventional 512-byte sector these attributes report in.
// Non-positive counts normalize to 0.
func smartLBAsToBytes(lbas int64) int64 {
	if lbas > 0 {
		return lbas * 512
	}
	return 0
}
|
||||
|
||||
func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, status *storageHealthStatus) {
|
||||
if s == nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:power_on_time.hours",
|
||||
"path:accumulated_power_on_time.hours",
|
||||
"path:power_on_time.hour",
|
||||
"path:accumulated_power_on_time.hour",
|
||||
); ok && v > 0 && s.PowerOnHours == nil {
|
||||
s.PowerOnHours = &v
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:power_cycle_count",
|
||||
"path:start_stop_cycle_count",
|
||||
"path:accumulated_start_stop_cycles",
|
||||
); ok && v > 0 && s.PowerCycles == nil {
|
||||
s.PowerCycles = &v
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:scsi_grown_defect_list",
|
||||
"path:grown_defect_list",
|
||||
); ok && v > 0 && s.ReallocatedSectors == nil {
|
||||
s.ReallocatedSectors = &v
|
||||
if status != nil && status.reallocatedSectors == 0 {
|
||||
status.reallocatedSectors = v
|
||||
}
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:percentage_used_endurance_indicator",
|
||||
"path:scsi_percentage_used_endurance_indicator",
|
||||
); ok && v > 0 {
|
||||
if s.LifeUsedPct == nil {
|
||||
fv := float64(v)
|
||||
s.LifeUsedPct = &fv
|
||||
}
|
||||
if s.LifeRemainingPct == nil && v <= 100 {
|
||||
remaining := float64(100 - v)
|
||||
s.LifeRemainingPct = &remaining
|
||||
if status != nil && status.lifeRemainingPct == 0 {
|
||||
status.lifeRemainingPct = int64(remaining)
|
||||
}
|
||||
}
|
||||
}
|
||||
blockSize, hasBlockSize := firstInt64(raw,
|
||||
"path:logical_block_size",
|
||||
"path:block_size",
|
||||
"path:user_capacity.block_size",
|
||||
)
|
||||
if hasBlockSize && blockSize > 0 {
|
||||
if s.LogicalBlockSizeBytes == nil {
|
||||
s.LogicalBlockSizeBytes = &blockSize
|
||||
}
|
||||
if s.MetadataBytesPerBlock == nil {
|
||||
zero := int64(0)
|
||||
s.MetadataBytesPerBlock = &zero
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
s.Telemetry["logical_block_size_bytes"] = *s.LogicalBlockSizeBytes
|
||||
s.Telemetry["metadata_bytes_per_block"] = *s.MetadataBytesPerBlock
|
||||
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:logical_blocks_written",
|
||||
"path:total_lbas_written",
|
||||
); ok && v > 0 && s.WrittenBytes == nil {
|
||||
bytes := v * blockSize
|
||||
s.WrittenBytes = &bytes
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:logical_blocks_read",
|
||||
"path:total_lbas_read",
|
||||
); ok && v > 0 && s.ReadBytes == nil {
|
||||
bytes := v * blockSize
|
||||
s.ReadBytes = &bytes
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
logical := parseStorageBytes(dev.LogSec)
|
||||
physical := parseStorageBytes(dev.PhySec)
|
||||
if logical <= 0 && physical <= 0 {
|
||||
return
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
if logical > 0 {
|
||||
s.LogicalBlockSizeBytes = &logical
|
||||
s.Telemetry["logical_block_size_bytes"] = logical
|
||||
if s.MetadataBytesPerBlock == nil {
|
||||
zero := int64(0)
|
||||
s.MetadataBytesPerBlock = &zero
|
||||
s.Telemetry["metadata_bytes_per_block"] = zero
|
||||
}
|
||||
}
|
||||
if physical > 0 {
|
||||
s.PhysicalBlockSizeBytes = &physical
|
||||
s.Telemetry["physical_block_size_bytes"] = physical
|
||||
}
|
||||
if s.LogicalBlockSizeBytes != nil && s.MetadataBytesPerBlock != nil {
|
||||
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||
}
|
||||
}
|
||||
|
||||
func applyNVMeBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||
return
|
||||
}
|
||||
out, err := exec.Command("nvme", "id-ns", devPath, "-H").CombinedOutput()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(string(out))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
func applySCSIProtectionBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||
return
|
||||
}
|
||||
out, err := exec.Command("sg_readcap", "-l", devPath).CombinedOutput()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(string(out))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
func setStorageBlockGeometry(s *schema.HardwareStorage, dataBytes, metadataBytes int64) {
|
||||
if s == nil || dataBytes <= 0 || metadataBytes < 0 {
|
||||
return
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
s.LogicalBlockSizeBytes = &dataBytes
|
||||
s.MetadataBytesPerBlock = &metadataBytes
|
||||
s.Telemetry["logical_block_size_bytes"] = dataBytes
|
||||
s.Telemetry["metadata_bytes_per_block"] = metadataBytes
|
||||
s.Telemetry["block_format"] = formatBlockFormat(dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
// formatBlockFormat renders a block geometry as "<data>+<metadata>"
// (e.g. "512+0", "4096+8"), matching the convention used by NVMe tooling.
func formatBlockFormat(dataBytes, metadataBytes int64) string {
	buf := strconv.AppendInt(nil, dataBytes, 10)
	buf = append(buf, '+')
	buf = strconv.AppendInt(buf, metadataBytes, 10)
	return string(buf)
}
|
||||
|
||||
func parseNVMeBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||
if m := nvmeLBAFCompactRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||
lbads, errLBADS := strconv.ParseInt(m[2], 10, 64)
|
||||
if errMS == nil && errLBADS == nil && lbads >= 0 && lbads < 63 {
|
||||
return 1 << lbads, ms, true
|
||||
}
|
||||
}
|
||||
if m := nvmeLBAFVerboseRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||
ds, errDS := strconv.ParseInt(m[2], 10, 64)
|
||||
if errMS == nil && errDS == nil && ds > 0 {
|
||||
return ds, ms, true
|
||||
}
|
||||
}
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
func parseSCSIBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||
m := sgReadcapBlockRE.FindStringSubmatch(raw)
|
||||
if len(m) != 2 {
|
||||
return 0, 0, false
|
||||
}
|
||||
blockBytes, err := strconv.ParseInt(m[1], 10, 64)
|
||||
if err != nil || blockBytes <= 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
if sgReadcapProtRE.MatchString(raw) {
|
||||
return blockBytes, 8, true
|
||||
}
|
||||
return blockBytes, 0, true
|
||||
}
|
||||
|
||||
// firstInt64 returns the first integer value found under any of the given
// candidate paths. Each candidate must be prefixed with "path:" followed by
// a dot-separated key path into the nested map; candidates without the
// prefix are ignored.
func firstInt64(root map[string]any, candidates ...string) (int64, bool) {
	for _, candidate := range candidates {
		path, hasPrefix := strings.CutPrefix(candidate, "path:")
		if !hasPrefix {
			continue
		}
		if v, ok := nestedInt64(root, strings.Split(path, ".")); ok {
			return v, true
		}
	}
	return 0, false
}

// nestedInt64 walks the key path through nested map[string]any values and
// coerces the leaf to int64. Floats are truncated toward zero; strings must
// parse as base-10 integers. ok is false when any key is missing, an
// intermediate value is not a map, or the leaf type is not numeric.
func nestedInt64(root map[string]any, path []string) (int64, bool) {
	var node any = root
	for _, key := range path {
		obj, isMap := node.(map[string]any)
		if !isMap {
			return 0, false
		}
		child, present := obj[key]
		if !present {
			return 0, false
		}
		node = child
	}
	switch leaf := node.(type) {
	case float64:
		return int64(leaf), true
	case float32:
		return int64(leaf), true
	case int:
		return int64(leaf), true
	case int64:
		return leaf, true
	case int32:
		return int64(leaf), true
	case json.Number:
		n, err := leaf.Int64()
		return n, err == nil
	case string:
		n, err := strconv.ParseInt(strings.TrimSpace(leaf), 10, 64)
		return n, err == nil
	default:
		return 0, false
	}
}
|
||||
|
||||
type storageHealthStatus struct {
|
||||
hasOverall bool
|
||||
overallPassed bool
|
||||
|
||||
69
audit/internal/collector/storage_block_format_test.go
Normal file
69
audit/internal/collector/storage_block_format_test.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseNVMeBlockFormatCompact(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 0 {
|
||||
t.Fatalf("got %d+%d want 512+0", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVMeBlockFormatVerbose(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
LBA Format 0 : Metadata Size: 8 bytes - Data Size: 512 bytes - Relative Performance: 0 Better (in use)
|
||||
LBA Format 1 : Metadata Size: 0 bytes - Data Size: 4096 bytes - Relative Performance: 1 Best
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 8 {
|
||||
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSCSIBlockFormatWithProtection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
Read Capacity results:
|
||||
Protection: prot_en=1, p_type=1, p_i_exponent=0
|
||||
Logical block length=512 bytes
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 8 {
|
||||
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSCSIBlockFormatWithoutProtection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
Read Capacity results:
|
||||
Protection: prot_en=0, p_type=0, p_i_exponent=0
|
||||
Logical block length=4096 bytes
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 4096 || metadataBytes != 0 {
|
||||
t.Fatalf("got %d+%d want 4096+0", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,12 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -31,3 +37,82 @@ func TestParseStorageBytes(t *testing.T) {
|
||||
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
rescanPath := filepath.Join(tmp, "pci-rescan")
|
||||
scanDir := filepath.Join(tmp, "scsi_host")
|
||||
host0Path := filepath.Join(scanDir, "host0", "scan")
|
||||
host1Path := filepath.Join(scanDir, "host1", "scan")
|
||||
argsPath := filepath.Join(tmp, "udevadm-args")
|
||||
toolPath := filepath.Join(tmp, "udevadm")
|
||||
if err := os.MkdirAll(filepath.Dir(host0Path), 0755); err != nil {
|
||||
t.Fatalf("mkdir host0: %v", err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(host1Path), 0755); err != nil {
|
||||
t.Fatalf("mkdir host1: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(host0Path, nil, 0644); err != nil {
|
||||
t.Fatalf("touch host0 scan: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(host1Path, nil, 0644); err != nil {
|
||||
t.Fatalf("touch host1 scan: %v", err)
|
||||
}
|
||||
script := "#!/bin/sh\nprintf '%s' \"$*\" > \"" + argsPath + "\"\n"
|
||||
if err := os.WriteFile(toolPath, []byte(script), 0755); err != nil {
|
||||
t.Fatalf("write udevadm stub: %v", err)
|
||||
}
|
||||
|
||||
oldPath := os.Getenv("PATH")
|
||||
if err := os.Setenv("PATH", tmp+string(os.PathListSeparator)+oldPath); err != nil {
|
||||
t.Fatalf("set PATH: %v", err)
|
||||
}
|
||||
defer func() { _ = os.Setenv("PATH", oldPath) }()
|
||||
|
||||
oldRescanPath := pciRescanPath
|
||||
oldSCSIGlob := scsiHostScanGlob
|
||||
oldWriteFile := hotplugWriteFile
|
||||
oldExecCommand := hotplugExecCommand
|
||||
oldGlob := hotplugGlob
|
||||
pciRescanPath = rescanPath
|
||||
scsiHostScanGlob = filepath.Join(scanDir, "host*", "scan")
|
||||
hotplugWriteFile = os.WriteFile
|
||||
hotplugExecCommand = exec.Command
|
||||
hotplugGlob = filepath.Glob
|
||||
defer func() {
|
||||
pciRescanPath = oldRescanPath
|
||||
scsiHostScanGlob = oldSCSIGlob
|
||||
hotplugWriteFile = oldWriteFile
|
||||
hotplugExecCommand = oldExecCommand
|
||||
hotplugGlob = oldGlob
|
||||
}()
|
||||
|
||||
bestEffortRescanHotplugStorage()
|
||||
|
||||
raw, err := os.ReadFile(rescanPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read rescan file: %v", err)
|
||||
}
|
||||
if string(raw) != "1\n" {
|
||||
t.Fatalf("rescan payload=%q want %q", string(raw), "1\n")
|
||||
}
|
||||
for _, path := range []string{host0Path, host1Path} {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read scsi scan file %s: %v", path, err)
|
||||
}
|
||||
if string(raw) != "- - -\n" {
|
||||
t.Fatalf("scsi scan payload at %s =%q want %q", path, string(raw), "- - -\n")
|
||||
}
|
||||
}
|
||||
|
||||
args, err := os.ReadFile(argsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read udevadm args: %v", err)
|
||||
}
|
||||
if got := strings.TrimSpace(string(args)); got != "settle --timeout=10" {
|
||||
t.Fatalf("udevadm args=%q want %q", got, "settle --timeout=10")
|
||||
}
|
||||
}
|
||||
|
||||
101
audit/internal/collector/storage_scsi_test.go
Normal file
101
audit/internal/collector/storage_scsi_test.go
Normal file
@@ -0,0 +1,101 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestApplySCSISmartctlTelemetry(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := map[string]any{
|
||||
"power_on_time": map[string]any{
|
||||
"hours": float64(32123),
|
||||
},
|
||||
"accumulated_start_stop_cycles": float64(17),
|
||||
"scsi_grown_defect_list": float64(4),
|
||||
"percentage_used_endurance_indicator": float64(12),
|
||||
"logical_block_size": float64(4096),
|
||||
"logical_blocks_written": float64(1000),
|
||||
"logical_blocks_read": float64(2000),
|
||||
}
|
||||
|
||||
var disk schema.HardwareStorage
|
||||
status := storageHealthStatus{}
|
||||
applySCSISmartctlTelemetry(&disk, raw, &status)
|
||||
|
||||
if disk.PowerOnHours == nil || *disk.PowerOnHours != 32123 {
|
||||
t.Fatalf("power_on_hours=%v want 32123", disk.PowerOnHours)
|
||||
}
|
||||
if disk.PowerCycles == nil || *disk.PowerCycles != 17 {
|
||||
t.Fatalf("power_cycles=%v want 17", disk.PowerCycles)
|
||||
}
|
||||
if disk.ReallocatedSectors == nil || *disk.ReallocatedSectors != 4 {
|
||||
t.Fatalf("reallocated=%v want 4", disk.ReallocatedSectors)
|
||||
}
|
||||
if disk.WrittenBytes == nil || *disk.WrittenBytes != 4096000 {
|
||||
t.Fatalf("written_bytes=%v want 4096000", disk.WrittenBytes)
|
||||
}
|
||||
if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 {
|
||||
t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes)
|
||||
}
|
||||
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 4096 {
|
||||
t.Fatalf("logical_block_size_bytes=%v want 4096", disk.LogicalBlockSizeBytes)
|
||||
}
|
||||
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||
}
|
||||
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 {
|
||||
t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct)
|
||||
}
|
||||
if disk.LifeRemainingPct == nil || *disk.LifeRemainingPct != 88 {
|
||||
t.Fatalf("life_remaining_pct=%v want 88", disk.LifeRemainingPct)
|
||||
}
|
||||
if status.reallocatedSectors != 4 {
|
||||
t.Fatalf("status.reallocated=%d want 4", status.reallocatedSectors)
|
||||
}
|
||||
if status.lifeRemainingPct != 88 {
|
||||
t.Fatalf("status.life_remaining_pct=%d want 88", status.lifeRemainingPct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
powerOnHours := int64(10)
|
||||
writtenBytes := int64(20)
|
||||
lifeRemaining := 30.0
|
||||
disk := schema.HardwareStorage{
|
||||
PowerOnHours: &powerOnHours,
|
||||
WrittenBytes: &writtenBytes,
|
||||
LifeRemainingPct: &lifeRemaining,
|
||||
}
|
||||
raw := map[string]any{
|
||||
"power_on_time": map[string]any{"hours": float64(999)},
|
||||
"logical_block_size": float64(512),
|
||||
"logical_blocks_written": float64(999),
|
||||
"percentage_used_endurance_indicator": float64(50),
|
||||
}
|
||||
|
||||
applySCSISmartctlTelemetry(&disk, raw, nil)
|
||||
|
||||
if *disk.PowerOnHours != 10 {
|
||||
t.Fatalf("power_on_hours overwritten: got %d want 10", *disk.PowerOnHours)
|
||||
}
|
||||
if *disk.WrittenBytes != 20 {
|
||||
t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
|
||||
}
|
||||
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 512 {
|
||||
t.Fatalf("logical_block_size_bytes=%v want 512", disk.LogicalBlockSizeBytes)
|
||||
}
|
||||
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||
}
|
||||
if *disk.LifeRemainingPct != 30 {
|
||||
t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
|
||||
}
|
||||
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 50 {
|
||||
t.Fatalf("life_used_pct=%v want 50", disk.LifeUsedPct)
|
||||
}
|
||||
}
|
||||
25
audit/internal/collector/storage_telemetry_test.go
Normal file
25
audit/internal/collector/storage_telemetry_test.go
Normal file
@@ -0,0 +1,25 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestSmartLBAsToBytes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
lbas int64
|
||||
want int64
|
||||
}{
|
||||
{name: "zero", lbas: 0, want: 0},
|
||||
{name: "single lba", lbas: 1, want: 512},
|
||||
{name: "multiple lbas", lbas: 2048, want: 1048576},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := smartLBAsToBytes(tt.lbas); got != tt.want {
|
||||
t.Fatalf("smartLBAsToBytes(%d)=%d want %d", tt.lbas, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMDAdmPlatformLicense(t *testing.T) {
|
||||
premium := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.3.0.1138
|
||||
RAID Levels : raid0 raid1 raid5 raid10
|
||||
Total Disks : 4
|
||||
License : Premium
|
||||
`
|
||||
got := parseMDAdmPlatformLicense(premium)
|
||||
if got == nil || *got != "premium" {
|
||||
t.Fatalf("expected 'premium', got %v", got)
|
||||
}
|
||||
|
||||
standard := `Platform : Intel(R) Virtual RAID on CPU
|
||||
License : Standard
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(standard)
|
||||
if got == nil || *got != "standard" {
|
||||
t.Fatalf("expected 'standard', got %v", got)
|
||||
}
|
||||
|
||||
noLicense := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.0.0
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(noLicense)
|
||||
if got != nil {
|
||||
t.Fatalf("expected nil, got %v", *got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasVROCController(t *testing.T) {
|
||||
intel := vendorIntel
|
||||
model := "Volume Management Device NVMe RAID Controller"
|
||||
|
||||
@@ -66,6 +66,7 @@ type HardwareSnapshot struct {
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
@@ -143,30 +144,33 @@ type HardwareMemory struct {
|
||||
|
||||
type HardwareStorage struct {
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
SizeGB *int `json:"size_gb,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Interface *string `json:"interface,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
SizeGB *int `json:"size_gb,omitempty"`
|
||||
LogicalBlockSizeBytes *int64 `json:"logical_block_size_bytes,omitempty"`
|
||||
PhysicalBlockSizeBytes *int64 `json:"physical_block_size_bytes,omitempty"`
|
||||
MetadataBytesPerBlock *int64 `json:"metadata_bytes_per_block,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Interface *string `json:"interface,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
type HardwarePCIeDevice struct {
|
||||
@@ -211,6 +215,7 @@ type HardwarePCIeDevice struct {
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
MacAddresses []string `json:"mac_addresses,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
IOMMUGroup *int `json:"iommu_group,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
|
||||
@@ -44,3 +44,57 @@ func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
|
||||
t.Fatalf("missing event_logs payload: %s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
|
||||
powerOnHours := int64(12450)
|
||||
writtenBytes := int64(9876543210)
|
||||
readBytes := int64(1234567890)
|
||||
lifeRemainingPct := 91.0
|
||||
logicalBlockSizeBytes := int64(512)
|
||||
physicalBlockSizeBytes := int64(4096)
|
||||
metadataBytesPerBlock := int64(8)
|
||||
|
||||
payload := HardwareIngestRequest{
|
||||
CollectedAt: "2026-03-15T15:00:00Z",
|
||||
Hardware: HardwareSnapshot{
|
||||
Board: HardwareBoard{SerialNumber: "SRV-001"},
|
||||
Storage: []HardwareStorage{
|
||||
{
|
||||
SerialNumber: stringPtr("DISK-001"),
|
||||
Model: stringPtr("TestDisk"),
|
||||
LogicalBlockSizeBytes: &logicalBlockSizeBytes,
|
||||
PhysicalBlockSizeBytes: &physicalBlockSizeBytes,
|
||||
MetadataBytesPerBlock: &metadataBytesPerBlock,
|
||||
PowerOnHours: &powerOnHours,
|
||||
WrittenBytes: &writtenBytes,
|
||||
ReadBytes: &readBytes,
|
||||
LifeRemainingPct: &lifeRemainingPct,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
data, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
text := string(data)
|
||||
for _, needle := range []string{
|
||||
`"storage":[{`,
|
||||
`"logical_block_size_bytes":512`,
|
||||
`"physical_block_size_bytes":4096`,
|
||||
`"metadata_bytes_per_block":8`,
|
||||
`"power_on_hours":12450`,
|
||||
`"written_bytes":9876543210`,
|
||||
`"read_bytes":1234567890`,
|
||||
`"life_remaining_pct":91`,
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("missing %q in payload: %s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func stringPtr(v string) *string {
|
||||
return &v
|
||||
}
|
||||
|
||||
@@ -125,6 +125,8 @@ func defaultTaskPriority(target string, params taskParams) int {
|
||||
return taskPriorityInstall
|
||||
case "install-to-ram":
|
||||
return taskPriorityInstallToRAM
|
||||
case "nvme-format":
|
||||
return taskPriorityInstall
|
||||
case "audit":
|
||||
return taskPriorityAudit
|
||||
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||
@@ -1038,6 +1040,81 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxStatus(w http.ResponseWriter, _ *http.Request) {
|
||||
state, err := app.ReadBlackboxState(filepath.Join(h.opts.ExportDir, "blackbox-state.json"))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
writeJSON(w, app.BlackboxState{Status: "disabled", Targets: []app.BlackboxTargetStatus{}})
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if state.Targets == nil {
|
||||
state.Targets = []app.BlackboxTargetStatus{}
|
||||
}
|
||||
writeJSON(w, state)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxEnable(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var target platform.RemovableTarget
|
||||
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || strings.TrimSpace(target.Device) == "" {
|
||||
writeError(w, http.StatusBadRequest, "device is required")
|
||||
return
|
||||
}
|
||||
targets, err := h.opts.App.ListRemovableTargets()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
allowed := false
|
||||
for _, candidate := range targets {
|
||||
if candidate.Device == target.Device {
|
||||
target = candidate
|
||||
allowed = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "device not in removable target list")
|
||||
return
|
||||
}
|
||||
marker, err := app.EnableBlackboxTarget(target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"status": "ok",
|
||||
"message": "Black-box marker written.",
|
||||
"enrollment_id": marker.EnrollmentID,
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxDisable(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if err := app.DisableBlackboxTarget(req.Device, req.EnrollmentID); err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
writeError(w, http.StatusNotFound, "black-box target not found")
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": "Black-box marker removed."})
|
||||
}
|
||||
|
||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||
@@ -1220,7 +1297,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
var standardTools = []string{
|
||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||
"mstflint", "qrencode",
|
||||
"mstflint",
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -3,6 +3,8 @@ package webui
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
@@ -44,6 +46,66 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
|
||||
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||
|
||||
h.handleAPIBlackboxStatus(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
var state app.BlackboxState
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &state); err != nil {
|
||||
t.Fatalf("decode state: %v", err)
|
||||
}
|
||||
if state.Status != "disabled" {
|
||||
t.Fatalf("status=%q want disabled", state.Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
||||
exportDir := t.TempDir()
|
||||
statePath := filepath.Join(exportDir, "blackbox-state.json")
|
||||
if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write state: %v", err)
|
||||
}
|
||||
h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||
|
||||
h.handleAPIBlackboxStatus(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
|
||||
t.Fatalf("body=%s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVMeFormatModes(t *testing.T) {
|
||||
raw := `
|
||||
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||
lbaf 2 : ms:0 lbads:12 rp:0
|
||||
`
|
||||
modes := parseNVMeFormatModes(raw)
|
||||
if len(modes) != 3 {
|
||||
t.Fatalf("modes=%#v want 3 modes", modes)
|
||||
}
|
||||
if modes[0].Mode != 0 || modes[0].DataBytes != 512 || modes[0].MetadataBytes != 0 || !modes[0].InUse {
|
||||
t.Fatalf("mode 0=%#v", modes[0])
|
||||
}
|
||||
if modes[1].Label != "MODE 1 (512+8)" {
|
||||
t.Fatalf("mode 1 label=%q", modes[1].Label)
|
||||
}
|
||||
if modes[2].DataBytes != 4096 || modes[2].MetadataBytes != 0 {
|
||||
t.Fatalf("mode 2=%#v", modes[2])
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
|
||||
@@ -91,6 +91,7 @@ func (j *jobState) writeLogLineLocked(line string) {
|
||||
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||
}
|
||||
_, _ = j.logBuf.WriteString(line + "\n")
|
||||
_ = j.logBuf.Flush()
|
||||
}
|
||||
|
||||
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||
|
||||
368
audit/internal/webui/nvme_format.go
Normal file
368
audit/internal/webui/nvme_format.go
Normal file
@@ -0,0 +1,368 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type nvmeFormatMode struct {
|
||||
Mode int `json:"mode"`
|
||||
DataBytes int64 `json:"data_bytes"`
|
||||
MetadataBytes int64 `json:"metadata_bytes"`
|
||||
InUse bool `json:"in_use"`
|
||||
Label string `json:"label"`
|
||||
}
|
||||
|
||||
type nvmeFormatDisk struct {
|
||||
Device string `json:"device"`
|
||||
Model string `json:"model,omitempty"`
|
||||
Serial string `json:"serial,omitempty"`
|
||||
Size string `json:"size,omitempty"`
|
||||
CurrentMode int `json:"current_mode"`
|
||||
CurrentFormat string `json:"current_format"`
|
||||
Modes []nvmeFormatMode `json:"modes"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
type nvmeListJSON struct {
|
||||
Devices []struct {
|
||||
DevicePath string `json:"DevicePath"`
|
||||
ModelNumber string `json:"ModelNumber"`
|
||||
SerialNumber string `json:"SerialNumber"`
|
||||
PhysicalSize int64 `json:"PhysicalSize"`
|
||||
} `json:"Devices"`
|
||||
}
|
||||
|
||||
var (
|
||||
nvmeFormatDeviceRE = regexp.MustCompile(`^/dev/nvme[0-9]+n[0-9]+$`)
|
||||
nvmeLBAFCompactLineRE = regexp.MustCompile(`(?im)^\s*lbaf\s+(\d+)\s*:\s*ms:(\d+)\s+lbads:(\d+).*$`)
|
||||
nvmeLBAFVerboseLineRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+(\d+)\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*$`)
|
||||
nvmeCommandContext = exec.CommandContext
|
||||
nvmeListFormatsTimeout = 20 * time.Second
|
||||
)
|
||||
|
||||
func listNVMeFormatDisks(ctx context.Context) ([]nvmeFormatDisk, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, nvmeListFormatsTimeout)
|
||||
defer cancel()
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "list", "-o", "json").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var root nvmeListJSON
|
||||
if err := json.Unmarshal(out, &root); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
disks := make([]nvmeFormatDisk, 0, len(root.Devices))
|
||||
seen := map[string]struct{}{}
|
||||
for _, dev := range root.Devices {
|
||||
path := strings.TrimSpace(dev.DevicePath)
|
||||
if !nvmeFormatDeviceRE.MatchString(path) {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[path]; ok {
|
||||
continue
|
||||
}
|
||||
seen[path] = struct{}{}
|
||||
disk := nvmeFormatDisk{
|
||||
Device: path,
|
||||
Model: strings.TrimSpace(dev.ModelNumber),
|
||||
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||
Size: formatNVMeBytes(dev.PhysicalSize),
|
||||
CurrentMode: -1,
|
||||
}
|
||||
modes, parseErr := readNVMeFormatModes(ctx, path)
|
||||
if parseErr != nil {
|
||||
disk.Error = parseErr.Error()
|
||||
}
|
||||
disk.Modes = modes
|
||||
for _, mode := range modes {
|
||||
if mode.InUse {
|
||||
disk.CurrentMode = mode.Mode
|
||||
disk.CurrentFormat = formatNVMeBlock(mode.DataBytes, mode.MetadataBytes)
|
||||
break
|
||||
}
|
||||
}
|
||||
disks = append(disks, disk)
|
||||
}
|
||||
sort.Slice(disks, func(i, j int) bool { return disks[i].Device < disks[j].Device })
|
||||
return disks, nil
|
||||
}
|
||||
|
||||
func readNVMeFormatModes(ctx context.Context, device string) ([]nvmeFormatMode, error) {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return nil, fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "id-ns", device, "-H").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
return nil, fmt.Errorf("%s", msg)
|
||||
}
|
||||
modes := parseNVMeFormatModes(string(out))
|
||||
if len(modes) == 0 {
|
||||
return nil, fmt.Errorf("no LBA format modes found")
|
||||
}
|
||||
return modes, nil
|
||||
}
|
||||
|
||||
func parseNVMeFormatModes(raw string) []nvmeFormatMode {
|
||||
byMode := map[int]nvmeFormatMode{}
|
||||
for _, m := range nvmeLBAFCompactLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
lbads, errLBADS := strconv.Atoi(m[3])
|
||||
if errMode != nil || errMS != nil || errLBADS != nil || lbads < 0 || lbads >= 63 {
|
||||
continue
|
||||
}
|
||||
data := int64(1) << lbads
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
for _, m := range nvmeLBAFVerboseLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
data, errData := strconv.ParseInt(m[3], 10, 64)
|
||||
if errMode != nil || errMS != nil || errData != nil || data <= 0 {
|
||||
continue
|
||||
}
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
modes := make([]nvmeFormatMode, 0, len(byMode))
|
||||
for _, mode := range byMode {
|
||||
modes = append(modes, mode)
|
||||
}
|
||||
sort.Slice(modes, func(i, j int) bool { return modes[i].Mode < modes[j].Mode })
|
||||
return modes
|
||||
}
|
||||
|
||||
func runNVMeFormatTask(ctx context.Context, j *jobState, device string, lbaf int) error {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
modes, err := readNVMeFormatModes(ctx, device)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var selected nvmeFormatMode
|
||||
found := false
|
||||
for _, mode := range modes {
|
||||
if mode.Mode == lbaf {
|
||||
selected = mode
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return fmt.Errorf("MODE %d is not available on %s", lbaf, device)
|
||||
}
|
||||
ms := 0
|
||||
if selected.MetadataBytes > 0 {
|
||||
ms = 1
|
||||
}
|
||||
j.append(fmt.Sprintf("Formatting %s to %s with --lbaf=%d --ms=%d --force", device, formatNVMeBlock(selected.DataBytes, selected.MetadataBytes), selected.Mode, ms))
|
||||
cmd := nvmeCommandContext(ctx, "nvme", "format", device, fmt.Sprintf("--lbaf=%d", selected.Mode), fmt.Sprintf("--ms=%d", ms), "--force")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormats(w http.ResponseWriter, r *http.Request) {
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, disks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormatRun(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
LBAF int `json:"lbaf"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if !nvmeFormatDeviceRE.MatchString(req.Device) {
|
||||
writeError(w, http.StatusBadRequest, "invalid NVMe device")
|
||||
return
|
||||
}
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
var label string
|
||||
allowed := false
|
||||
for _, disk := range disks {
|
||||
if disk.Device != req.Device {
|
||||
continue
|
||||
}
|
||||
for _, mode := range disk.Modes {
|
||||
if mode.Mode == req.LBAF {
|
||||
allowed = true
|
||||
label = mode.Label
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "LBA format mode is not available for this device")
|
||||
return
|
||||
}
|
||||
name := fmt.Sprintf("NVMe Format %s to %s", filepath.Base(req.Device), label)
|
||||
t := &Task{
|
||||
ID: newJobID("nvme-format"),
|
||||
Name: name,
|
||||
Target: "nvme-format",
|
||||
Priority: defaultTaskPriority("nvme-format", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Device: req.Device,
|
||||
LBAF: req.LBAF,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
|
||||
func formatNVMeBlock(dataBytes, metadataBytes int64) string {
|
||||
return strconv.FormatInt(dataBytes, 10) + "+" + strconv.FormatInt(metadataBytes, 10)
|
||||
}
|
||||
|
||||
func formatNVMeBytes(n int64) string {
|
||||
if n <= 0 {
|
||||
return ""
|
||||
}
|
||||
units := []string{"B", "KB", "MB", "GB", "TB", "PB"}
|
||||
v := float64(n)
|
||||
unit := 0
|
||||
for v >= 1000 && unit < len(units)-1 {
|
||||
v /= 1000
|
||||
unit++
|
||||
}
|
||||
if unit == 0 {
|
||||
return fmt.Sprintf("%d B", n)
|
||||
}
|
||||
return fmt.Sprintf("%.1f %s", v, units[unit])
|
||||
}
|
||||
|
||||
func renderNVMeFormatInline() string {
|
||||
return `<div id="nvme-format-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVMe disks...</div>
|
||||
<div id="nvme-format-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<script>
|
||||
function nvmeFormatEsc(s) {
|
||||
return String(s == null ? '' : s).replace(/[&<>"']/g, function(c) {
|
||||
return {'&':'&','<':'<','>':'>','"':'"',"'":'''}[c];
|
||||
});
|
||||
}
|
||||
function loadNVMeFormats() {
|
||||
var status = document.getElementById('nvme-format-status');
|
||||
var table = document.getElementById('nvme-format-table');
|
||||
status.textContent = 'Loading NVMe disks...';
|
||||
status.style.color = 'var(--muted)';
|
||||
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
|
||||
fetch('/api/tools/nvme-formats').then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(disks) {
|
||||
window._nvmeFormatDisks = Array.isArray(disks) ? disks : [];
|
||||
if (!window._nvmeFormatDisks.length) {
|
||||
status.textContent = 'No NVMe disks found.';
|
||||
table.innerHTML = '';
|
||||
return;
|
||||
}
|
||||
status.textContent = window._nvmeFormatDisks.length + ' NVMe disk(s) found.';
|
||||
var rows = window._nvmeFormatDisks.map(function(d, idx) {
|
||||
var current = d.current_format ? (d.current_format + ' / MODE ' + d.current_mode) : 'unknown';
|
||||
var detail = [d.model || '', d.serial || '', d.size || ''].filter(Boolean).join(' | ');
|
||||
var options = (d.modes || []).map(function(m) {
|
||||
return '<option value="' + m.mode + '"' + (m.in_use ? ' selected' : '') + '>' + nvmeFormatEsc(m.label) + '</option>';
|
||||
}).join('');
|
||||
var disabled = options ? '' : ' disabled';
|
||||
var err = d.error ? '<div style="font-size:12px;color:var(--crit-fg,#9f3a38);margin-top:4px">' + nvmeFormatEsc(d.error) + '</div>' : '';
|
||||
return '<tr>'
|
||||
+ '<td style="font-family:monospace;white-space:nowrap">' + nvmeFormatEsc(d.device) + (detail ? '<div style="font-family:inherit;font-size:12px;color:var(--muted)">' + nvmeFormatEsc(detail) + '</div>' : '') + '</td>'
|
||||
+ '<td style="white-space:nowrap">' + nvmeFormatEsc(current) + err + '</td>'
|
||||
+ '<td style="white-space:nowrap"><select id="nvme-format-select-' + idx + '"' + disabled + '>' + options + '</select></td>'
|
||||
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-primary" onclick="nvmeFormatRun(' + idx + ', this)"' + disabled + '>Apply</button><div class="nvme-format-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div></td>'
|
||||
+ '</tr>';
|
||||
}).join('');
|
||||
table.innerHTML = '<table><tr><th>Disk</th><th>Current block / mode</th><th>New mode</th><th>Action</th></tr>' + rows + '</table>';
|
||||
}).catch(function(e) {
|
||||
status.textContent = 'Error loading NVMe disks: ' + e.message;
|
||||
status.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
table.innerHTML = '';
|
||||
});
|
||||
}
|
||||
function nvmeWaitTaskDone(taskID, rowMsg) {
|
||||
var timer = setInterval(function() {
|
||||
fetch('/api/tasks').then(function(r) { return r.json(); }).then(function(tasks) {
|
||||
var task = (tasks || []).find(function(t) { return t.id === taskID; });
|
||||
if (!task) return;
|
||||
if (task.status === 'done' || task.status === 'failed' || task.status === 'cancelled') {
|
||||
clearInterval(timer);
|
||||
rowMsg.textContent = 'Task ' + taskID + ': ' + task.status + (task.error ? ' - ' + task.error : '');
|
||||
rowMsg.style.color = task.status === 'done' ? 'var(--ok,green)' : 'var(--crit-fg,#9f3a38)';
|
||||
loadNVMeFormats();
|
||||
}
|
||||
}).catch(function(){});
|
||||
}, 1500);
|
||||
}
|
||||
function nvmeFormatRun(idx, btn) {
|
||||
var disk = (window._nvmeFormatDisks || [])[idx];
|
||||
var select = document.getElementById('nvme-format-select-' + idx);
|
||||
var row = btn.closest('td');
|
||||
var rowMsg = row.querySelector('.nvme-format-row-msg');
|
||||
if (!disk || !select) return;
|
||||
var lbaf = parseInt(select.value, 10);
|
||||
var mode = (disk.modes || []).find(function(m) { return m.mode === lbaf; });
|
||||
if (!mode) return;
|
||||
if (!window.confirm('Format ' + disk.device + ' to ' + mode.label + '? This erases data on the namespace.')) return;
|
||||
btn.disabled = true;
|
||||
rowMsg.style.color = 'var(--muted)';
|
||||
rowMsg.textContent = 'Queued...';
|
||||
fetch('/api/tools/nvme-format/run', {
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body:JSON.stringify({device: disk.device, lbaf: lbaf})
|
||||
}).then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(d) {
|
||||
rowMsg.textContent = 'Task ' + d.task_id + ' queued.';
|
||||
nvmeWaitTaskDone(d.task_id, rowMsg);
|
||||
}).catch(function(e) {
|
||||
rowMsg.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
rowMsg.textContent = 'Error: ' + e.message;
|
||||
}).finally(function() {
|
||||
btn.disabled = false;
|
||||
});
|
||||
}
|
||||
loadNVMeFormats();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNVMeFormatCard() string {
|
||||
return `<div class="card"><div class="card-head">NVMe Block Format <button class="btn btn-sm btn-secondary" onclick="loadNVMeFormats()" style="margin-left:auto">↻ Refresh</button></div><div class="card-body">` +
|
||||
`<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Lists NVMe namespaces and changes their LBA format through a queued task.</p>` +
|
||||
renderNVMeFormatInline() + `</div></div>`
|
||||
}
|
||||
@@ -102,47 +102,69 @@ window.supportBundleDownload = function() {
|
||||
|
||||
func renderUSBExportCard() string {
|
||||
return `<div class="card" style="margin-top:16px">
|
||||
<div class="card-head">Export to USB
|
||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
<div class="card-head">USB Black-Box
|
||||
<button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
</div>
|
||||
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||
</div>`
|
||||
}
|
||||
|
||||
func renderUSBExportInline() string {
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
|
||||
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||
<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
|
||||
<div id="usb-targets" style="margin-top:12px"></div>
|
||||
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||
<script>
|
||||
(function(){
|
||||
function usbRefresh() {
|
||||
function blackboxRefresh() {
|
||||
document.getElementById('usb-status').textContent = 'Scanning...';
|
||||
document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
|
||||
document.getElementById('usb-targets').innerHTML = '';
|
||||
document.getElementById('usb-msg').textContent = '';
|
||||
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
|
||||
window._usbTargets = Array.isArray(targets) ? targets : [];
|
||||
Promise.all([
|
||||
fetch('/api/export/usb').then(r=>r.json()),
|
||||
fetch('/api/blackbox/status').then(r=>r.json())
|
||||
]).then(function(values) {
|
||||
const targets = Array.isArray(values[0]) ? values[0] : [];
|
||||
const state = values[1] || {};
|
||||
const active = Array.isArray(state.targets) ? state.targets : [];
|
||||
window._usbTargets = targets;
|
||||
window._blackboxTargets = active;
|
||||
const st = document.getElementById('usb-status');
|
||||
const ct = document.getElementById('usb-targets');
|
||||
const summary = document.getElementById('blackbox-summary');
|
||||
if (state.boot_folder) {
|
||||
summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
|
||||
} else {
|
||||
summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
|
||||
}
|
||||
if (!targets || targets.length === 0) {
|
||||
st.textContent = 'No removable USB devices found.';
|
||||
return;
|
||||
} else {
|
||||
st.textContent = targets.length + ' device(s) found:';
|
||||
}
|
||||
st.textContent = targets.length + ' device(s) found:';
|
||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
|
||||
const byDevice = {};
|
||||
active.forEach(function(item) { byDevice[item.device] = item; });
|
||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
|
||||
targets.map((t, idx) => {
|
||||
const dev = t.device || '';
|
||||
const label = t.label || '';
|
||||
const model = t.model || '';
|
||||
const state = byDevice[dev];
|
||||
const status = state ? (state.status + (state.flush_period ? ', flush ' + state.flush_period : '')) : 'not enrolled';
|
||||
const detail = state && state.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+state.last_error+'</div>') : '';
|
||||
return '<tr>' +
|
||||
'<td style="font-family:monospace">'+dev+'</td>' +
|
||||
'<td>'+t.fs_type+'</td>' +
|
||||
'<td>'+t.size+'</td>' +
|
||||
'<td>'+label+'</td>' +
|
||||
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||
'<td style="font-size:12px">'+status+detail+'</td>' +
|
||||
'<td style="white-space:nowrap">' +
|
||||
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
|
||||
(state
|
||||
? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
|
||||
: '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
|
||||
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
|
||||
'</td></tr>';
|
||||
}).join('') + '</table>';
|
||||
@@ -150,7 +172,7 @@ function usbRefresh() {
|
||||
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||
});
|
||||
}
|
||||
window.usbExport = function(type, targetIndex, btn) {
|
||||
window.blackboxEnable = function(targetIndex, btn) {
|
||||
const target = (window._usbTargets || [])[targetIndex];
|
||||
if (!target) {
|
||||
const msg = document.getElementById('usb-msg');
|
||||
@@ -164,15 +186,15 @@ window.usbExport = function(type, targetIndex, btn) {
|
||||
const originalText = btn ? btn.textContent : '';
|
||||
if (btn) {
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Exporting...';
|
||||
btn.textContent = 'Enabling...';
|
||||
}
|
||||
if (rowMsg) {
|
||||
rowMsg.style.color = 'var(--muted)';
|
||||
rowMsg.textContent = 'Working...';
|
||||
}
|
||||
msg.style.color = 'var(--muted)';
|
||||
msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
|
||||
fetch('/api/export/usb/'+type, {
|
||||
msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
|
||||
fetch('/api/blackbox/enable', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(target)
|
||||
@@ -199,10 +221,64 @@ window.usbExport = function(type, targetIndex, btn) {
|
||||
btn.disabled = false;
|
||||
btn.textContent = originalText;
|
||||
}
|
||||
setTimeout(blackboxRefresh, 300);
|
||||
});
|
||||
};
|
||||
window.usbRefresh = usbRefresh;
|
||||
usbRefresh();
|
||||
window.blackboxDisable = function(targetIndex, btn) {
|
||||
const target = (window._usbTargets || [])[targetIndex];
|
||||
const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
|
||||
if (!target || !active) {
|
||||
const msg = document.getElementById('usb-msg');
|
||||
msg.style.color = 'var(--err,red)';
|
||||
msg.textContent = 'Error: black-box target not found. Refresh and try again.';
|
||||
return;
|
||||
}
|
||||
const msg = document.getElementById('usb-msg');
|
||||
const row = btn ? btn.closest('td') : null;
|
||||
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
|
||||
const originalText = btn ? btn.textContent : '';
|
||||
if (btn) {
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Disabling...';
|
||||
}
|
||||
if (rowMsg) {
|
||||
rowMsg.style.color = 'var(--muted)';
|
||||
rowMsg.textContent = 'Working...';
|
||||
}
|
||||
msg.style.color = 'var(--muted)';
|
||||
msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
|
||||
fetch('/api/blackbox/disable', {
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
|
||||
}).then(async r => {
|
||||
const d = await r.json();
|
||||
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
|
||||
return d;
|
||||
}).then(d => {
|
||||
msg.style.color = 'var(--ok,green)';
|
||||
msg.textContent = d.message || 'Done.';
|
||||
if (rowMsg) {
|
||||
rowMsg.style.color = 'var(--ok,green)';
|
||||
rowMsg.textContent = d.message || 'Done.';
|
||||
}
|
||||
}).catch(e => {
|
||||
msg.style.color = 'var(--err,red)';
|
||||
msg.textContent = 'Error: '+e;
|
||||
if (rowMsg) {
|
||||
rowMsg.style.color = 'var(--err,red)';
|
||||
rowMsg.textContent = 'Error: ' + e;
|
||||
}
|
||||
}).finally(() => {
|
||||
if (btn) {
|
||||
btn.disabled = false;
|
||||
btn.textContent = originalText;
|
||||
}
|
||||
setTimeout(blackboxRefresh, 300);
|
||||
});
|
||||
};
|
||||
window.blackboxRefresh = blackboxRefresh;
|
||||
blackboxRefresh();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
@@ -382,7 +458,7 @@ function installToRAM() {
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
|
||||
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
</div></div>
|
||||
@@ -399,6 +475,7 @@ function installToRAM() {
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
` + renderNVMeFormatCard() + `
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
|
||||
@@ -301,11 +301,14 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Export
|
||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
||||
mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
|
||||
mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
|
||||
mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)
|
||||
|
||||
// Tools
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
|
||||
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
@@ -571,6 +574,7 @@ func (h *handler) handleExportIndex(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
|
||||
snapshot, _ := loadSnapshot(h.opts.AuditPath)
|
||||
snapshot = enrichSnapshotForViewer(snapshot)
|
||||
body, err := viewer.RenderHTML(snapshot, h.opts.Title)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
|
||||
@@ -671,11 +671,17 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Export to USB`) {
|
||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
||||
if !strings.Contains(body, `USB Black-Box`) {
|
||||
t.Fatalf("tools page missing usb black-box section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
||||
if !strings.Contains(body, `/api/blackbox/status`) {
|
||||
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `NVMe Block Format`) {
|
||||
t.Fatalf("tools page missing nvme block format section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/tools/nvme-formats`) || !strings.Contains(body, `/api/tools/nvme-format/run`) {
|
||||
t.Fatalf("tools page missing nvme format api usage: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1016,6 +1022,39 @@ func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewerRendersDerivedStorageBlockFormat(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
body := `{
|
||||
"collected_at":"2026-04-29T00:05:00Z",
|
||||
"hardware":{
|
||||
"board":{"serial_number":"SERIAL-NEW"},
|
||||
"storage":[
|
||||
{
|
||||
"serial_number":"DISK-1",
|
||||
"model":"Test NVMe",
|
||||
"logical_block_size_bytes":512,
|
||||
"physical_block_size_bytes":4096,
|
||||
"metadata_bytes_per_block":8
|
||||
}
|
||||
]
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/viewer", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
if !strings.Contains(rec.Body.String(), "512+8") {
|
||||
t.Fatalf("viewer body missing derived block format: %s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -1038,6 +1077,36 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditJSONDoesNotInjectDerivedStorageBlockFormat(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
body := `{
|
||||
"hardware":{
|
||||
"board":{"serial_number":"SERIAL-API"},
|
||||
"storage":[
|
||||
{
|
||||
"serial_number":"DISK-1",
|
||||
"logical_block_size_bytes":512,
|
||||
"metadata_bytes_per_block":8
|
||||
}
|
||||
]
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit.json", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
if strings.Contains(rec.Body.String(), "block_format") {
|
||||
t.Fatalf("audit.json should remain contract-only: %s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"})
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
@@ -376,6 +376,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
||||
break
|
||||
}
|
||||
err = a.RunInstallToRAM(ctx, j.append)
|
||||
case "nvme-format":
|
||||
if strings.TrimSpace(t.params.Device) == "" {
|
||||
err = fmt.Errorf("device is required")
|
||||
break
|
||||
}
|
||||
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
|
||||
@@ -57,6 +57,7 @@ var taskNames = map[string]string{
|
||||
"support-bundle": "Support Bundle",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
"nvme-format": "NVMe Block Format Change",
|
||||
}
|
||||
|
||||
// burnNames maps target → human-readable name when a burn profile is set.
|
||||
@@ -137,6 +138,7 @@ type taskParams struct {
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
}
|
||||
|
||||
@@ -598,6 +600,17 @@ func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
|
||||
}
|
||||
|
||||
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
|
||||
startedKmsgWatch := false
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
startedKmsgWatch = true
|
||||
}
|
||||
defer func() {
|
||||
if startedKmsgWatch && q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
}()
|
||||
|
||||
stopTail := make(chan struct{})
|
||||
doneTail := make(chan struct{})
|
||||
defer func() {
|
||||
|
||||
@@ -126,6 +126,23 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestJobAppendFlushesTaskLogImmediately(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "task.log")
|
||||
j := newTaskJobState(path)
|
||||
|
||||
j.append("live-line")
|
||||
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if string(data) != "live-line\n" {
|
||||
t.Fatalf("log=%q want live-line newline", string(data))
|
||||
}
|
||||
j.closeLog()
|
||||
}
|
||||
|
||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||
q := &taskQueue{
|
||||
@@ -849,3 +866,82 @@ func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskExternalOpensAndClosesKmsgWindow(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
releasePath := filepath.Join(dir, "release")
|
||||
readyPath := filepath.Join(dir, "ready")
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{ExportDir: dir},
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
kmsgWatcher: newKmsgWatcher(nil),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-external-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
q.assignTaskLogPathLocked(tk)
|
||||
j := newTaskJobState(tk.LogPath)
|
||||
|
||||
orig := externalTaskRunnerCommand
|
||||
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||
script := "printf ready > \"$1\"; while [ ! -f \"$2\" ]; do sleep 0.05; done"
|
||||
return exec.Command("sh", "-c", script, "sh", readyPath, releasePath), nil
|
||||
}
|
||||
defer func() { externalTaskRunnerCommand = orig }()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
q.runTaskExternal(tk, j)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if _, err := os.Stat(readyPath); err == nil {
|
||||
break
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
if _, err := os.Stat(readyPath); err != nil {
|
||||
t.Fatalf("external runner did not start: %v", err)
|
||||
}
|
||||
|
||||
q.kmsgWatcher.mu.Lock()
|
||||
activeCount := q.kmsgWatcher.activeCount
|
||||
window := q.kmsgWatcher.window
|
||||
q.kmsgWatcher.mu.Unlock()
|
||||
if activeCount != 1 {
|
||||
t.Fatalf("activeCount while running=%d want 1", activeCount)
|
||||
}
|
||||
if window == nil || len(window.targets) != 1 || window.targets[0] != "cpu" {
|
||||
t.Fatalf("window while running=%+v", window)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(releasePath, []byte("1\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("runTaskExternal did not return")
|
||||
}
|
||||
|
||||
q.kmsgWatcher.mu.Lock()
|
||||
activeCount = q.kmsgWatcher.activeCount
|
||||
window = q.kmsgWatcher.window
|
||||
q.kmsgWatcher.mu.Unlock()
|
||||
if activeCount != 0 {
|
||||
t.Fatalf("activeCount after finish=%d want 0", activeCount)
|
||||
}
|
||||
if window != nil {
|
||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||
}
|
||||
}
|
||||
|
||||
62
audit/internal/webui/viewer_snapshot.go
Normal file
62
audit/internal/webui/viewer_snapshot.go
Normal file
@@ -0,0 +1,62 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
func enrichSnapshotForViewer(snapshot []byte) []byte {
|
||||
if len(snapshot) == 0 {
|
||||
return snapshot
|
||||
}
|
||||
var root map[string]any
|
||||
if err := json.Unmarshal(snapshot, &root); err != nil {
|
||||
return snapshot
|
||||
}
|
||||
hardware, _ := root["hardware"].(map[string]any)
|
||||
if len(hardware) == 0 {
|
||||
return snapshot
|
||||
}
|
||||
storage, _ := hardware["storage"].([]any)
|
||||
if len(storage) == 0 {
|
||||
return snapshot
|
||||
}
|
||||
changed := false
|
||||
for _, item := range storage {
|
||||
row, _ := item.(map[string]any)
|
||||
if len(row) == 0 {
|
||||
continue
|
||||
}
|
||||
if _, exists := row["block_format"]; exists {
|
||||
continue
|
||||
}
|
||||
logical, okLogical := jsonNumberToInt64(row["logical_block_size_bytes"])
|
||||
metadata, okMetadata := jsonNumberToInt64(row["metadata_bytes_per_block"])
|
||||
if !okLogical || !okMetadata || logical <= 0 || metadata < 0 {
|
||||
continue
|
||||
}
|
||||
row["block_format"] = strconv.FormatInt(logical, 10) + "+" + strconv.FormatInt(metadata, 10)
|
||||
changed = true
|
||||
}
|
||||
if !changed {
|
||||
return snapshot
|
||||
}
|
||||
out, err := json.Marshal(root)
|
||||
if err != nil {
|
||||
return snapshot
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func jsonNumberToInt64(v any) (int64, bool) {
|
||||
switch x := v.(type) {
|
||||
case float64:
|
||||
return int64(x), true
|
||||
case int64:
|
||||
return x, true
|
||||
case int:
|
||||
return int64(x), true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
2
bible
2
bible
Submodule bible updated: 1d89a4918e...d2600f1279
@@ -9,5 +9,62 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
||||
|---|---|
|
||||
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
||||
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
||||
| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list |
|
||||
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||
| `decisions/` | Architectural decision log |
|
||||
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
|
||||
| `decisions/` | Architectural decision log, including read-only submodule policy |
|
||||
|
||||
## Validate Test Matrix
|
||||
|
||||
### Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
### Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
@@ -149,7 +149,6 @@ Current validation state:
|
||||
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
|
||||
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
|
||||
8. output JSON → /var/log/bee-audit.json
|
||||
9. QR summary to stdout (qrencode if available)
|
||||
```
|
||||
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
@@ -58,6 +58,8 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
|
||||
- Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
|
||||
- Contract fields that have no honest local source on a generic Linux host may remain empty.
|
||||
- Embedded submodules such as `internal/chart/` and `bible/` are read-only for `bee` feature work.
|
||||
- If the UI needs extra information, `bee` must emit it through the standard audit JSON contract rather than patching `chart`.
|
||||
|
||||
## Tech stack
|
||||
|
||||
@@ -101,7 +103,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| `iso/builder/` | ISO build scripts and `live-build` profile |
|
||||
| `iso/overlay/` | Source overlay copied into a staged build overlay |
|
||||
| `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
|
||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
|
||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web`; update by submodule pointer only, never by local `bee`-specific edits |
|
||||
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
||||
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
||||
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
# Decision: Treat embedded submodules as read-only
|
||||
|
||||
## Context
|
||||
|
||||
`bee` embeds external git submodules such as:
|
||||
|
||||
- `internal/chart/` — `reanimator/chart`, a generic read-only viewer for Reanimator JSON snapshots
|
||||
- `bible/` — shared engineering rules and contracts
|
||||
|
||||
These repositories are reused by other projects. A local feature request in `bee`
|
||||
must not be solved by silently changing shared submodule behavior.
|
||||
|
||||
The concrete failure mode here was attempting to add project-specific storage
|
||||
telemetry presentation by editing `internal/chart/`. That couples a shared viewer
|
||||
to one host application's needs and creates hidden cross-project regressions.
|
||||
|
||||
## Decision
|
||||
|
||||
Embedded submodules are read-only from the point of view of `bee`.
|
||||
|
||||
- Do not implement `bee`-specific behavior by editing `internal/chart/`.
|
||||
- Do not implement `bee`-specific behavior by editing `bible/`.
|
||||
- If `bee` needs new data in the report, produce it in the standard audit JSON
|
||||
emitted by `bee` itself.
|
||||
- `chart` must continue to consume the canonical snapshot as an external viewer,
|
||||
without host-specific forks.
|
||||
- Updating a submodule pointer to an upstream commit is allowed.
|
||||
- Carrying local unmerged submodule commits as part of a `bee` feature is forbidden.
|
||||
|
||||
## Consequences
|
||||
|
||||
- Audit/report features must be expressed through the contract in
|
||||
`bible-local/docs/hardware-ingest-contract.md`.
|
||||
- `bee` owns collection, normalization, and serialization of storage telemetry in
|
||||
`hardware.storage[]`.
|
||||
- `chart` remains a pure visualization module that reads the snapshot it is given.
|
||||
- If a capability is genuinely missing in a shared submodule, it must be proposed
|
||||
and landed upstream as a generic change first, then pulled into `bee` via a
|
||||
normal submodule update.
|
||||
@@ -6,3 +6,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
||||
|---|---|---|
|
||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||
| 2026-04-29 | Treat embedded submodules as read-only | active |
|
||||
|
||||
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
@@ -0,0 +1,54 @@
|
||||
# GPU PCIe Test Methodology
|
||||
|
||||
## Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
## Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
title: Hardware Ingest JSON Contract
|
||||
version: "2.7"
|
||||
updated: "2026-03-15"
|
||||
version: "2.10"
|
||||
updated: "2026-04-29"
|
||||
maintainer: Reanimator Core
|
||||
audience: external-integrators, ai-agents
|
||||
language: ru
|
||||
@@ -9,7 +9,7 @@ language: ru
|
||||
|
||||
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
||||
|
||||
Версия: **2.7** · Дата: **2026-03-15**
|
||||
Версия: **2.10** · Дата: **2026-04-29**
|
||||
|
||||
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
||||
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
||||
@@ -22,6 +22,9 @@ language: ru
|
||||
|
||||
| Версия | Дата | Изменения |
|
||||
|--------|------|-----------|
|
||||
| 2.10 | 2026-04-29 | Для `hardware.storage[]` добавлены необязательные числовые поля `logical_block_size_bytes`, `physical_block_size_bytes`, `metadata_bytes_per_block` для нормализованного описания формата блока накопителя |
|
||||
| 2.9 | 2026-03-19 | Добавлена необязательная секция `hardware.platform_config` — произвольный объект с настройками платформы (BIOS/Redfish); хранится как latest-snapshot per machine |
|
||||
| 2.8 | 2026-03-15 | Поле `location` удалено из всех `sensors.*`; сенсоры передаются только по `name` и измеренным значениям |
|
||||
| 2.7 | 2026-03-15 | Явно запрещён синтез данных в `event_logs`; интеграторы не должны придумывать серийные номера компонентов, если источник их не отдал |
|
||||
| 2.6 | 2026-03-15 | Добавлена необязательная секция `event_logs` для dedup/upsert логов `host` / `bmc` / `redfish` вне history timeline |
|
||||
| 2.5 | 2026-03-15 | Добавлено общее необязательное поле `manufactured_year_week` для компонентных секций (`YYYY-Www`) |
|
||||
@@ -131,8 +134,9 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
"storage": [ ... ],
|
||||
"pcie_devices": [ ... ],
|
||||
"power_supplies": [ ... ],
|
||||
"sensors": { ... },
|
||||
"event_logs": [ ... ]
|
||||
"sensors": { ... },
|
||||
"event_logs": [ ... ],
|
||||
"platform_config": { ... }
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -343,6 +347,9 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
| `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` |
|
||||
| `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` |
|
||||
| `size_gb` | int | нет | Размер в ГБ |
|
||||
| `logical_block_size_bytes` | int64 | нет | Логический размер пользовательского блока данных, например `512` или `4096` |
|
||||
| `physical_block_size_bytes` | int64 | нет | Физический размер блока, если известен, например `4096` |
|
||||
| `metadata_bytes_per_block` | int64 | нет | Metadata / protection bytes на логический блок, например `0` или `8` |
|
||||
| `temperature_c` | float | нет | Температура накопителя, °C (telemetry) |
|
||||
| `power_on_hours` | int64 | нет | Время работы, часы |
|
||||
| `power_cycles` | int64 | нет | Количество циклов питания |
|
||||
@@ -363,6 +370,11 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
|
||||
Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`.
|
||||
|
||||
Формат вида `512+8` в контракт не добавляется отдельным строковым полем. Если источник знает такую форму, он должен передавать её как:
|
||||
- `logical_block_size_bytes = 512`
|
||||
- `metadata_bytes_per_block = 8`
|
||||
- `physical_block_size_bytes = 512` или `4096`, если известен физический размер блока
|
||||
|
||||
```json
|
||||
"storage": [
|
||||
{
|
||||
@@ -370,6 +382,9 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
"type": "NVMe",
|
||||
"model": "INTEL SSDPF2KX076T1",
|
||||
"size_gb": 7680,
|
||||
"logical_block_size_bytes": 512,
|
||||
"physical_block_size_bytes": 4096,
|
||||
"metadata_bytes_per_block": 8,
|
||||
"temperature_c": 38.5,
|
||||
"power_on_hours": 12450,
|
||||
"unsafe_shutdowns": 3,
|
||||
@@ -592,7 +607,6 @@ PSU без `serial_number` игнорируется.
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `name` | string | **да** | Уникальное имя сенсора в рамках секции |
|
||||
| `location` | string | нет | Физическое расположение |
|
||||
| `rpm` | int | нет | Обороты, RPM |
|
||||
| `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` |
|
||||
|
||||
@@ -601,7 +615,6 @@ PSU без `serial_number` игнорируется.
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `name` | string | **да** | Уникальное имя сенсора |
|
||||
| `location` | string | нет | Физическое расположение |
|
||||
| `voltage_v` | float | нет | Напряжение, В |
|
||||
| `current_a` | float | нет | Ток, А |
|
||||
| `power_w` | float | нет | Мощность, Вт |
|
||||
@@ -612,7 +625,6 @@ PSU без `serial_number` игнорируется.
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `name` | string | **да** | Уникальное имя сенсора |
|
||||
| `location` | string | нет | Физическое расположение |
|
||||
| `celsius` | float | нет | Температура, °C |
|
||||
| `threshold_warning_celsius` | float | нет | Порог Warning, °C |
|
||||
| `threshold_critical_celsius` | float | нет | Порог Critical, °C |
|
||||
@@ -623,29 +635,29 @@ PSU без `serial_number` игнорируется.
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `name` | string | **да** | Уникальное имя сенсора |
|
||||
| `location` | string | нет | Физическое расположение |
|
||||
| `value` | float | нет | Значение |
|
||||
| `unit` | string | нет | Единица измерения |
|
||||
| `status` | string | нет | Статус |
|
||||
|
||||
**Правила sensors:**
|
||||
- Идентификатор сенсора: пара `(sensor_type, name)`. Дубли в одном payload — берётся первое вхождение.
|
||||
- `location` для сенсоров передавать не нужно и не следует: в Reanimator location/slot используется только для проверки перемещения и установки компонентов, а не для last-known-value sensor ingest.
|
||||
- Сенсоры без `name` игнорируются.
|
||||
- При каждом импорте значения перезаписываются (upsert по ключу).
|
||||
|
||||
```json
|
||||
"sensors": {
|
||||
"fans": [
|
||||
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" },
|
||||
{ "name": "FAN_CPU0", "location": "CPU0", "rpm": 5600, "status": "OK" }
|
||||
{ "name": "FAN1", "rpm": 4200, "status": "OK" },
|
||||
{ "name": "FAN_CPU0", "rpm": 5600, "status": "OK" }
|
||||
],
|
||||
"power": [
|
||||
{ "name": "12V Rail", "location": "Mainboard", "voltage_v": 12.06, "status": "OK" },
|
||||
{ "name": "PSU0 Input", "location": "PSU0", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
|
||||
{ "name": "12V Rail", "voltage_v": 12.06, "status": "OK" },
|
||||
{ "name": "PSU0 Input", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
|
||||
],
|
||||
"temperatures": [
|
||||
{ "name": "CPU0 Temp", "location": "CPU0", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
|
||||
{ "name": "Inlet Temp", "location": "Front", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
|
||||
{ "name": "CPU0 Temp", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
|
||||
{ "name": "Inlet Temp", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
|
||||
],
|
||||
"other": [
|
||||
{ "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" }
|
||||
@@ -655,6 +667,31 @@ PSU без `serial_number` игнорируется.
|
||||
|
||||
---
|
||||
|
||||
## Секция platform_config
|
||||
|
||||
Необязательный объект с произвольными ключами — настройки платформы как есть из источника (BIOS, Redfish, IPMI).
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `platform_config` | object | нет | Произвольный объект: ключи — строки, значения — строки, числа, булевы |
|
||||
|
||||
**Правила platform_config:**
|
||||
- Содержимое объекта не валидируется: передавайте параметры как есть.
|
||||
- При каждом импорте хранится latest-snapshot per machine; история изменений по каждому ключу накапливается отдельно.
|
||||
- Если секция отсутствует или равна `null` — данные платформы не обновляются.
|
||||
|
||||
```json
|
||||
"platform_config": {
|
||||
"SecureBoot": "Enabled",
|
||||
"BiosVersion": "06.08.05",
|
||||
"TpmEnabled": true,
|
||||
"NumaEnabled": false,
|
||||
"HyperThreading": "Enabled"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Обработка статусов компонентов
|
||||
|
||||
| Статус | Поведение |
|
||||
@@ -787,6 +824,12 @@ PSU без `serial_number` игнорируется.
|
||||
"other": [
|
||||
{ "name": "System Humidity", "value": 38.5, "unit": "%" }
|
||||
]
|
||||
},
|
||||
"platform_config": {
|
||||
"SecureBoot": "Enabled",
|
||||
"BiosVersion": "06.08.05",
|
||||
"TpmEnabled": true,
|
||||
"HyperThreading": "Enabled"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
31
bible-local/rules/patterns/ascii-safe-text/contract.md
Normal file
31
bible-local/rules/patterns/ascii-safe-text/contract.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Contract: ASCII-Safe Text in Scripts and Boot Configs
|
||||
|
||||
Version: 1.0
|
||||
|
||||
## Principle
|
||||
|
||||
Shell scripts, bootloader configs, and any text rendered on serial/SOL consoles must use only printable ASCII characters. Non-ASCII Unicode — including typographic punctuation such as the em-dash (U+2014 `—`), en-dash (U+2013 `–`), curly quotes, and ellipsis (U+2026 `…`) — breaks rendering on serial terminals, GRUB text/serial mode, IPMI SOL, and tooling that assumes ASCII.
|
||||
|
||||
## Rules
|
||||
|
||||
- Never use em-dash (`—`) or en-dash (`–`) in any shell script, GRUB config, syslinux/isolinux config, or service unit file. Use ASCII double-hyphen `--` or single hyphen `-` instead.
|
||||
- Never use curly quotes (`"` `"` `'` `'`) in shell scripts or configs. Use straight quotes `"` and `'`.
|
||||
- Never use the Unicode ellipsis (`…`). Use `...`.
|
||||
- GRUB `menuentry` and `submenu` titles must be ASCII-only — GRUB serial terminal output is ASCII; non-ASCII characters render as garbage or are dropped.
|
||||
- Comments in GRUB theme files (`.txt`) must also be ASCII-only, as GRUB may parse the entire file.
|
||||
|
||||
## Why
|
||||
|
||||
GRUB renders menus over both `gfxterm` (graphical, Unicode-capable) and `serial` (ASCII-only) simultaneously when `terminal_output gfxterm serial` is set. The serial output — used by IPMI SOL and BMC remote consoles — cannot display multi-byte UTF-8 sequences and shows raw bytes or drops characters. A menuentry title `"EASY-BEE — GSP=off"` appears as `"EASY-BEE â€" GSP=off"` or `"EASY-BEE GSP=off"` on SOL, making the menu unreadable.
|
||||
|
||||
## Anti-patterns
|
||||
|
||||
- `menuentry "EASY-BEE — GSP=off"` — em-dash in GRUB title
|
||||
- `# bee logo — centered` — em-dash in GRUB theme comment
|
||||
- `echo "done — reboot"` in a shell script displayed over serial
|
||||
|
||||
## Correct form
|
||||
|
||||
- `menuentry "EASY-BEE -- GSP=off"`
|
||||
- `# bee logo - centered`
|
||||
- `echo "done - reboot"`
|
||||
134
git-bible/grub-bitmap-error.md
Normal file
134
git-bible/grub-bitmap-error.md
Normal file
@@ -0,0 +1,134 @@
|
||||
# GRUB bitmap error: null src bitmap in grub_video_bitmap_create_scaled
|
||||
|
||||
## Symptom
|
||||
|
||||
```
|
||||
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||
Press any key to continue...
|
||||
```
|
||||
|
||||
Appears on boot before the GRUB menu renders. The menu still appears after pressing a key,
|
||||
but without the bee logo. Reproduced on real hardware (Lenovo SR650 V3, ASUS GPU servers).
|
||||
|
||||
## Root cause model
|
||||
|
||||
`grub_video_bitmap_create_scaled` receives a null `src` pointer, meaning the PNG loader
|
||||
returned null for `bee-logo.png`. GRUB calls this function even when no explicit
|
||||
`width`/`height` are set in `theme.txt` — it is invoked any time an image component is
|
||||
rendered, passing the image's natural dimensions as the target size.
|
||||
|
||||
The PNG file is referenced as `file = "bee-logo.png"` (relative to theme dir).
|
||||
GRUB resolves this to `/boot/grub/live-theme/bee-logo.png`.
|
||||
|
||||
## Attempts that did NOT fix the error
|
||||
|
||||
### Attempt 1 — add explicit `width`/`height` to image block (d52ec67)
|
||||
|
||||
**What was done:** First introduction of bee-logo.png with:
|
||||
```
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
width = 400
|
||||
height = 400
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
```
|
||||
PNG at this point was RGBA (color_type=6).
|
||||
|
||||
**Result:** Error appeared immediately on first ISO build.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 2 — remove `width`/`height` from image block (aa284ae)
|
||||
|
||||
**Hypothesis:** Explicit scaling dimensions trigger the scale path; removing them avoids it.
|
||||
|
||||
**What was done:** Removed `width = 400` and `height = 400` from the image block.
|
||||
```
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
```
|
||||
|
||||
**Result:** Error persists. GRUB calls `grub_video_bitmap_create_scaled` regardless of whether
|
||||
`width`/`height` are specified — if the bitmap is null (loading failed), the error fires either way.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 3 — convert PNG to RGBA + strip metadata chunks (6112094)
|
||||
|
||||
**Hypothesis:** GRUB's minimal PNG parser is confused by metadata chunks (cHRM, bKGD, tIME, tEXt).
|
||||
Also re-ordered `terminal_output gfxterm` before `insmod png` / theme load.
|
||||
|
||||
**What was done:**
|
||||
- Converted PNG to RGBA color_type=6, stripped all ancillary chunks
|
||||
- Moved `terminal_output gfxterm` earlier in config.cfg
|
||||
- Removed echo ASCII art banner from grub.cfg
|
||||
|
||||
**Result:** Error persists — and this change actually confirmed RGBA does not work:
|
||||
GRUB's PNG loader does not render RGBA PNGs correctly on this platform.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 4 — convert PNG from RGBA back to RGB (333c44f, most recent)
|
||||
|
||||
**Hypothesis:** GRUB does not support RGBA (color_type=6); RGB (color_type=2) is the correct format.
|
||||
Alpha channel composited onto black background (#000000) to match `desktop-color`.
|
||||
|
||||
**What was done:** Converted bee-logo.png from RGBA to RGB via ImageMagick.
|
||||
|
||||
**Current file state:**
|
||||
- 400×400 px, 8-bit/color RGB, non-interlaced
|
||||
- Only IHDR + IDAT + IEND chunks (no metadata)
|
||||
- `insmod png` is present in config.cfg
|
||||
- `terminal_output gfxterm` runs before theme is sourced
|
||||
- No explicit `width`/`height` in image block
|
||||
|
||||
**Result:** Error still occurs on real hardware. Despite the PNG being nominally correct
|
||||
(RGB, non-interlaced, minimal chunks), the bitmap load returns null.
|
||||
|
||||
## Confirmed root cause (verified on 172.16.41.94, 2026-04-30)
|
||||
|
||||
The EFI partition (`sda2`, vfat, 5 MB) contains only:
|
||||
```
|
||||
/EFI/boot/bootia32.efi
|
||||
/EFI/boot/bootx64.efi
|
||||
/EFI/boot/grubx64.efi
|
||||
/boot/grub/grub.cfg
|
||||
```
|
||||
|
||||
`config.cfg`, `theme.cfg`, and the entire `live-theme/` directory (including `bee-logo.png`)
|
||||
are **absent from the EFI image**. `live-build`'s `lb binary_grub-efi` stage is not
|
||||
copying these files. GRUB boots, sources only `grub.cfg`, then fails to load the theme
|
||||
because the file does not exist — returning a null bitmap regardless of PNG format.
|
||||
|
||||
All four fix attempts were targeting the wrong layer (PNG format/content).
|
||||
|
||||
## Fix (applied 2026-04-30)
|
||||
|
||||
Switched from PNG to TGA format:
|
||||
|
||||
1. Converted `bee-logo.png` → `bee-logo.tga` (24-bit uncompressed BGR, top-left origin,
|
||||
480018 bytes). Conversion done via Python stdlib (no external tools needed).
|
||||
2. `config.cfg`: `insmod png` → `insmod tga`
|
||||
3. `theme.txt`: `file = "bee-logo.png"` → `file = "bee-logo.tga"`
|
||||
|
||||
**Why TGA works:** GRUB's TGA reader (`tga.mod`) handles uncompressed 24-bit images
|
||||
trivially — no decompression, no complex chunk parsing. The module is present on-disk
|
||||
(`x86_64-efi/tga.mod`). PNG was failing despite a valid file; the exact GRUB bug is
|
||||
unknown but the PNG reader in Debian bookworm's grub2 is known to be fragile.
|
||||
|
||||
The old `bee-logo.png` is kept in the tree (may be useful for other tools) but is no
|
||||
longer referenced by the theme.
|
||||
|
||||
## Relevant files
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `iso/builder/config/bootloaders/grub-efi/config.cfg` | insmod png, gfxterm init, theme source |
|
||||
| `iso/builder/config/bootloaders/grub-efi/theme.cfg` | sets `theme=` path |
|
||||
| `iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt` | image component definition |
|
||||
| `iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png` | the logo PNG |
|
||||
Submodule internal/chart updated: ac8120c8ab...2a15bc87f1
@@ -31,10 +31,10 @@ Build with explicit SSH keys baked into the ISO:
|
||||
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
||||
```
|
||||
|
||||
Rebuild the builder image:
|
||||
Force a clean rebuild of the builder image and build caches:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --rebuild-image
|
||||
sh iso/builder/build-in-container.sh --clean-build
|
||||
```
|
||||
|
||||
Use a custom cache directory:
|
||||
|
||||
@@ -10,7 +10,6 @@ IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
AUTH_KEYS=""
|
||||
REBUILD_IMAGE=0
|
||||
CLEAN_CACHE=0
|
||||
VARIANT="all"
|
||||
|
||||
@@ -22,17 +21,12 @@ while [ $# -gt 0 ]; do
|
||||
CACHE_DIR="$2"
|
||||
shift 2
|
||||
;;
|
||||
--rebuild-image)
|
||||
REBUILD_IMAGE=1
|
||||
shift
|
||||
;;
|
||||
--authorized-keys)
|
||||
AUTH_KEYS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--clean-build)
|
||||
CLEAN_CACHE=1
|
||||
REBUILD_IMAGE=1
|
||||
shift
|
||||
;;
|
||||
--variant)
|
||||
@@ -41,7 +35,7 @@ while [ $# -gt 0 ]; do
|
||||
;;
|
||||
*)
|
||||
echo "unknown arg: $1" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -105,7 +99,7 @@ image_matches_platform() {
|
||||
}
|
||||
|
||||
NEED_BUILD_IMAGE=0
|
||||
if [ "$REBUILD_IMAGE" = "1" ]; then
|
||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
|
||||
@@ -848,6 +848,73 @@ reset_live_build_stage() {
|
||||
done
|
||||
}
|
||||
|
||||
# Marker written after every successful full lb build for this variant
|
||||
FULL_BUILD_MARKER="${BUILD_WORK_DIR}/.bee-full-build-marker"
|
||||
|
||||
# Returns 0 if full lb build is needed, 1 if fast-path is safe.
|
||||
# Fast-path is safe when only light files changed since the last full build
|
||||
# (Go source, overlay scripts/configs). Heavy changes (VERSIONS, package lists,
|
||||
# hooks, archives, Dockerfile, auto/config) require a full lb build.
|
||||
needs_full_build() {
|
||||
[ -f "${FULL_BUILD_MARKER}" ] || return 0
|
||||
[ -f "${BUILD_WORK_DIR}/binary/live/filesystem.squashfs" ] || return 0
|
||||
[ -f "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso" ] || return 0
|
||||
|
||||
_heavy=$(find \
|
||||
"${BUILDER_DIR}/VERSIONS" \
|
||||
"${BUILDER_DIR}/auto/config" \
|
||||
"${BUILDER_DIR}/Dockerfile" \
|
||||
"${BUILDER_DIR}/config/package-lists" \
|
||||
"${BUILDER_DIR}/config/hooks" \
|
||||
"${BUILDER_DIR}/config/archives" \
|
||||
"${BUILDER_DIR}/config/bootloaders" \
|
||||
-newer "${FULL_BUILD_MARKER}" 2>/dev/null | head -1)
|
||||
|
||||
if [ -n "$_heavy" ]; then
|
||||
echo "=== full build required: heavy config changed: $(basename "$_heavy") ==="
|
||||
return 0
|
||||
fi
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
|
||||
# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
|
||||
fast_path_repack_squashfs() {
|
||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
||||
_tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
|
||||
echo "=== fast-path: unsquash ($(du -sh "$_sq" | cut -f1) compressed) ==="
|
||||
rm -rf "$_tmp"
|
||||
unsquashfs -d "$_tmp" "$_sq"
|
||||
echo "=== fast-path: syncing overlay stage ==="
|
||||
rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
|
||||
echo "=== fast-path: repacking squashfs ==="
|
||||
_sq_new="${_sq}.new"
|
||||
rm -f "$_sq_new"
|
||||
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress
|
||||
mv "$_sq_new" "$_sq"
|
||||
rm -rf "$_tmp"
|
||||
echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
|
||||
}
|
||||
|
||||
# Fast-path: rebuild ISO by replacing only live/filesystem.squashfs via xorriso.
|
||||
# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
|
||||
fast_path_rebuild_iso() {
|
||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
||||
_prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
|
||||
_new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
|
||||
echo "=== fast-path: rebuilding ISO with xorriso ==="
|
||||
rm -f "$_new"
|
||||
xorriso \
|
||||
-indev "$_prior" \
|
||||
-outdev "$_new" \
|
||||
-map "$_sq" /live/filesystem.squashfs \
|
||||
-boot_image any replay \
|
||||
-commit
|
||||
mv "$_new" "$_prior"
|
||||
echo "=== fast-path: ISO rebuilt ==="
|
||||
}
|
||||
|
||||
recover_iso_memtest() {
|
||||
lb_dir="$1"
|
||||
iso_path="$2"
|
||||
@@ -1487,6 +1554,21 @@ if [ -f "${LB_INCLUDES}/root/.ssh/authorized_keys" ]; then
|
||||
chmod 600 "${LB_INCLUDES}/root/.ssh/authorized_keys"
|
||||
fi
|
||||
|
||||
# --- auto fast-path: squashfs surgery if only light files changed ---
|
||||
if ! needs_full_build; then
|
||||
echo "=== fast-path build (no heavy config changes since last full build) ==="
|
||||
fast_path_repack_squashfs
|
||||
fast_path_rebuild_iso
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
validate_iso_live_boot_entries "$ISO_RAW"
|
||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done (${BUILD_VARIANT}, fast-path) ==="
|
||||
echo "ISO: $ISO_OUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- build ISO using live-build ---
|
||||
echo ""
|
||||
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||
@@ -1535,6 +1617,7 @@ if [ -f "$ISO_RAW" ]; then
|
||||
validate_iso_live_boot_entries "$ISO_RAW"
|
||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
touch "${FULL_BUILD_MARKER}"
|
||||
echo ""
|
||||
echo "=== done (${BUILD_VARIANT}) ==="
|
||||
echo "ISO: $ISO_OUT"
|
||||
|
||||
@@ -27,5 +27,5 @@ insmod gfxterm
|
||||
terminal_input console serial
|
||||
terminal_output gfxterm serial
|
||||
|
||||
insmod png
|
||||
insmod tga
|
||||
source /boot/grub/theme.cfg
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 78 KiB After Width: | Height: | Size: 77 KiB |
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.tga
Normal file
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.tga
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 469 KiB |
@@ -9,7 +9,7 @@ terminal-font: "Unifont Regular 16"
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
file = "bee-logo.png"
|
||||
file = "bee-logo.tga"
|
||||
}
|
||||
|
||||
#help bar at the bottom
|
||||
|
||||
@@ -31,6 +31,7 @@ systemctl enable bee-preflight.service
|
||||
systemctl enable bee-audit.service
|
||||
systemctl enable bee-web.service
|
||||
systemctl enable bee-sshsetup.service
|
||||
systemctl enable bee-blackbox.service
|
||||
systemctl enable bee-selfheal.timer
|
||||
systemctl enable bee-boot-status.service
|
||||
systemctl enable ssh.service
|
||||
|
||||
@@ -47,18 +47,27 @@ vim-tiny
|
||||
mc
|
||||
htop
|
||||
nvtop
|
||||
btop
|
||||
sudo
|
||||
zstd
|
||||
mstflint
|
||||
memtester
|
||||
stress-ng
|
||||
stressapptest
|
||||
|
||||
# QR codes (for displaying audit results)
|
||||
qrencode
|
||||
fio
|
||||
iperf3
|
||||
iotop
|
||||
nload
|
||||
tcpdump
|
||||
hdparm
|
||||
sysstat
|
||||
lsscsi
|
||||
sg3-utils
|
||||
jq
|
||||
curl
|
||||
net-tools
|
||||
|
||||
# Local desktop (openbox + chromium kiosk)
|
||||
gparted
|
||||
openbox
|
||||
tint2
|
||||
feh
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: hardware audit
|
||||
After=bee-preflight.service bee-network.service bee-nvidia.service
|
||||
After=bee-preflight.service bee-network.service bee-nvidia.service bee-blackbox.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
|
||||
18
iso/overlay/etc/systemd/system/bee-blackbox.service
Normal file
18
iso/overlay/etc/systemd/system/bee-blackbox.service
Normal file
@@ -0,0 +1,18 @@
|
||||
[Unit]
|
||||
Description=Bee: USB black-box log mirror
|
||||
After=local-fs.target
|
||||
Before=bee-network.service bee-nvidia.service bee-preflight.service bee-audit.service bee-web.service
|
||||
StartLimitIntervalSec=0
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-blackbox.log /usr/local/bin/bee blackbox --export-dir /appdata/bee/export --state-file /appdata/bee/export/blackbox-state.json
|
||||
Restart=always
|
||||
RestartSec=1
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
OOMScoreAdjust=-900
|
||||
Nice=0
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -1,6 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: bring up network interfaces via DHCP
|
||||
After=local-fs.target
|
||||
After=local-fs.target bee-blackbox.service
|
||||
Before=network-online.target bee-audit.service
|
||||
|
||||
[Service]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: load NVIDIA kernel modules and create device nodes
|
||||
After=local-fs.target udev.service
|
||||
After=local-fs.target udev.service bee-blackbox.service
|
||||
Before=bee-audit.service
|
||||
|
||||
[Service]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: runtime preflight self-check
|
||||
After=bee-network.service bee-nvidia.service
|
||||
After=bee-network.service bee-nvidia.service bee-blackbox.service
|
||||
Before=bee-audit.service
|
||||
|
||||
[Service]
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: hardware audit web viewer
|
||||
After=bee-blackbox.service
|
||||
StartLimitIntervalSec=0
|
||||
|
||||
[Service]
|
||||
|
||||
BIN
iso/vendor/arcconf
vendored
Executable file
BIN
iso/vendor/arcconf
vendored
Executable file
Binary file not shown.
BIN
iso/vendor/sas2ircu
vendored
Executable file
BIN
iso/vendor/sas2ircu
vendored
Executable file
Binary file not shown.
BIN
iso/vendor/sas3ircu
vendored
Executable file
BIN
iso/vendor/sas3ircu
vendored
Executable file
Binary file not shown.
BIN
iso/vendor/ssacli
vendored
Executable file
BIN
iso/vendor/ssacli
vendored
Executable file
Binary file not shown.
BIN
iso/vendor/storcli64
vendored
Executable file
BIN
iso/vendor/storcli64
vendored
Executable file
Binary file not shown.
@@ -1,74 +0,0 @@
|
||||
#!/bin/sh
|
||||
# fetch-vendor.sh — download proprietary vendor utilities into iso/vendor.
|
||||
#
|
||||
# Usage:
|
||||
# STORCLI_URL=... STORCLI_SHA256=... \
|
||||
# SAS2IRCU_URL=... SAS2IRCU_SHA256=... \
|
||||
# SAS3IRCU_URL=... SAS3IRCU_SHA256=... \
|
||||
# MSTFLINT_URL=... MSTFLINT_SHA256=... \
|
||||
# sh scripts/fetch-vendor.sh
|
||||
|
||||
set -eu
|
||||
|
||||
ROOT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
|
||||
OUT_DIR="$ROOT_DIR/iso/vendor"
|
||||
mkdir -p "$OUT_DIR"
|
||||
|
||||
need_cmd() {
|
||||
command -v "$1" >/dev/null 2>&1 || { echo "ERROR: required command not found: $1" >&2; exit 1; }
|
||||
}
|
||||
|
||||
need_cmd sha256sum
|
||||
|
||||
download_to() {
|
||||
url="$1"
|
||||
out="$2"
|
||||
if command -v wget >/dev/null 2>&1; then
|
||||
wget -O "$out" "$url"
|
||||
return 0
|
||||
fi
|
||||
if command -v curl >/dev/null 2>&1; then
|
||||
curl -fsSL "$url" -o "$out"
|
||||
return 0
|
||||
fi
|
||||
echo "ERROR: required command not found: wget or curl" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
fetch_one() {
|
||||
name="$1"
|
||||
url="$2"
|
||||
sha="$3"
|
||||
|
||||
if [ -z "$url" ] || [ -z "$sha" ]; then
|
||||
echo "[vendor] skip $name (URL/SHA not provided)"
|
||||
return 0
|
||||
fi
|
||||
|
||||
dst="$OUT_DIR/$name"
|
||||
tmp="$dst.tmp"
|
||||
|
||||
echo "[vendor] downloading $name"
|
||||
download_to "$url" "$tmp"
|
||||
|
||||
got=$(sha256sum "$tmp" | awk '{print $1}')
|
||||
want=$(echo "$sha" | tr '[:upper:]' '[:lower:]')
|
||||
if [ "$got" != "$want" ]; then
|
||||
rm -f "$tmp"
|
||||
echo "ERROR: checksum mismatch for $name" >&2
|
||||
echo " got: $got" >&2
|
||||
echo " want: $want" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mv "$tmp" "$dst"
|
||||
chmod +x "$dst" || true
|
||||
echo "[vendor] ok: $name"
|
||||
}
|
||||
|
||||
fetch_one "storcli64" "${STORCLI_URL:-}" "${STORCLI_SHA256:-}"
|
||||
fetch_one "sas2ircu" "${SAS2IRCU_URL:-}" "${SAS2IRCU_SHA256:-}"
|
||||
fetch_one "sas3ircu" "${SAS3IRCU_URL:-}" "${SAS3IRCU_SHA256:-}"
|
||||
fetch_one "mstflint" "${MSTFLINT_URL:-}" "${MSTFLINT_SHA256:-}"
|
||||
|
||||
echo "[vendor] done. output dir: $OUT_DIR"
|
||||
Reference in New Issue
Block a user