Compare commits
43 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
58d6da0e4f | ||
|
|
7ce73e34a4 | ||
|
|
8a21809ade | ||
|
|
626763e31d | ||
|
|
0b8a2ff83f | ||
|
|
2c22b01fe3 | ||
|
|
ec89616585 | ||
|
|
c0dbbf96ad | ||
|
|
76484b123c | ||
|
|
8901596152 | ||
|
|
7c504e5056 | ||
|
|
333c44f3ba | ||
|
|
3bca821d3e | ||
|
|
3648e37a1e | ||
|
|
d109e08fab | ||
|
|
11d00b9442 | ||
|
|
6defa5ae15 | ||
|
|
c76658ed00 | ||
|
|
2163017a98 | ||
| 29179917c3 | |||
| be4b439804 | |||
| 749fc8a94d | |||
| 6112094d45 | |||
| e9a2bc9f9d | |||
|
|
7a8f884664 | ||
|
|
8bf8dfa45b | ||
|
|
6a22199aff | ||
|
|
ddb2bb5d1c | ||
|
|
aa284ae754 | ||
|
|
8512098174 | ||
|
|
6b5d22c194 | ||
|
|
a35e90a93e | ||
|
|
1ced81707f | ||
|
|
679aeb9947 | ||
|
|
647e99b697 | ||
|
|
4af997f436 | ||
|
|
6caace0cc0 | ||
|
|
5f0103635b | ||
|
|
84a2551dc0 | ||
|
|
1cfabc9230 | ||
|
|
5dc711de23 | ||
|
|
ab802719f8 | ||
|
|
a94e8007f8 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,3 +3,4 @@
|
|||||||
dist/
|
dist/
|
||||||
iso/out/
|
iso/out/
|
||||||
build-cache/
|
build-cache/
|
||||||
|
audit/bee
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@@ -67,10 +68,14 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
|||||||
return runSupportBundle(args[1:], stdout, stderr)
|
return runSupportBundle(args[1:], stdout, stderr)
|
||||||
case "web":
|
case "web":
|
||||||
return runWeb(args[1:], stdout, stderr)
|
return runWeb(args[1:], stdout, stderr)
|
||||||
|
case "blackbox":
|
||||||
|
return runBlackbox(args[1:], stdout, stderr)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT(args[1:], stdout, stderr)
|
return runSAT(args[1:], stdout, stderr)
|
||||||
case "benchmark":
|
case "benchmark":
|
||||||
return runBenchmark(args[1:], stdout, stderr)
|
return runBenchmark(args[1:], stdout, stderr)
|
||||||
|
case "bee-worker":
|
||||||
|
return runBeeWorker(args[1:], stdout, stderr)
|
||||||
case "version", "--version", "-version":
|
case "version", "--version", "-version":
|
||||||
fmt.Fprintln(stdout, Version)
|
fmt.Fprintln(stdout, Version)
|
||||||
return 0
|
return 0
|
||||||
@@ -88,8 +93,10 @@ func printRootUsage(w io.Writer) {
|
|||||||
bee export --target <device>
|
bee export --target <device>
|
||||||
bee support-bundle --output stdout|file:<path>
|
bee support-bundle --output stdout|file:<path>
|
||||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||||
|
bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
|
||||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||||
|
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
|
||||||
bee version
|
bee version
|
||||||
bee help [command]`)
|
bee help [command]`)
|
||||||
}
|
}
|
||||||
@@ -106,10 +113,14 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||||
case "web":
|
case "web":
|
||||||
return runWeb([]string{"--help"}, stdout, stdout)
|
return runWeb([]string{"--help"}, stdout, stdout)
|
||||||
|
case "blackbox":
|
||||||
|
return runBlackbox([]string{"--help"}, stdout, stdout)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT([]string{"--help"}, stdout, stderr)
|
return runSAT([]string{"--help"}, stdout, stderr)
|
||||||
case "benchmark":
|
case "benchmark":
|
||||||
return runBenchmark([]string{"--help"}, stdout, stderr)
|
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||||
|
case "bee-worker":
|
||||||
|
return runBeeWorker([]string{"--help"}, stdout, stderr)
|
||||||
case "version":
|
case "version":
|
||||||
fmt.Fprintln(stdout, "usage: bee version")
|
fmt.Fprintln(stdout, "usage: bee version")
|
||||||
return 0
|
return 0
|
||||||
@@ -335,6 +346,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBlackbox(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||||
|
statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
|
||||||
|
if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
|
||||||
|
slog.Error("run blackbox", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||||
@@ -462,6 +500,28 @@ func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBeeWorker(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
|
||||||
|
taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
|
||||||
|
}
|
||||||
|
|
||||||
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||||
raw = strings.TrimSpace(raw)
|
raw = strings.TrimSpace(raw)
|
||||||
if raw == "" {
|
if raw == "" {
|
||||||
|
|||||||
779
audit/internal/app/blackbox.go
Normal file
779
audit/internal/app/blackbox.go
Normal file
@@ -0,0 +1,779 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"crypto/rand"
|
||||||
|
"encoding/hex"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io/fs"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
blackboxMarkerName = ".bee-blackbox"
|
||||||
|
blackboxDiscoverInterval = 2 * time.Second
|
||||||
|
blackboxMinFlushPeriod = 1 * time.Second
|
||||||
|
blackboxMaxFlushPeriod = 30 * time.Second
|
||||||
|
blackboxRecoveryFastCount = 5
|
||||||
|
)
|
||||||
|
|
||||||
|
var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
|
||||||
|
|
||||||
|
var (
|
||||||
|
blackboxExecCommand = exec.Command
|
||||||
|
blackboxNow = func() time.Time { return time.Now().UTC() }
|
||||||
|
)
|
||||||
|
|
||||||
|
type BlackboxMarker struct {
|
||||||
|
Version int `json:"version"`
|
||||||
|
EnrollmentID string `json:"enrollment_id"`
|
||||||
|
CreatedAtUTC string `json:"created_at_utc"`
|
||||||
|
Host string `json:"host,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BlackboxTargetStatus struct {
|
||||||
|
EnrollmentID string `json:"enrollment_id"`
|
||||||
|
Device string `json:"device"`
|
||||||
|
FS platform.RemovableTarget `json:"fs"`
|
||||||
|
BootFolder string `json:"boot_folder"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
LastSyncAtUTC string `json:"last_sync_at_utc,omitempty"`
|
||||||
|
LastCycleDuration string `json:"last_cycle_duration,omitempty"`
|
||||||
|
FlushPeriod string `json:"flush_period"`
|
||||||
|
LastError string `json:"last_error,omitempty"`
|
||||||
|
Mountpoint string `json:"mountpoint,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BlackboxState struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
BootStartedAtUTC string `json:"boot_started_at_utc"`
|
||||||
|
BootFolder string `json:"boot_folder"`
|
||||||
|
UpdatedAtUTC string `json:"updated_at_utc"`
|
||||||
|
Targets []BlackboxTargetStatus `json:"targets"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type blackboxRuntime struct {
|
||||||
|
exportDir string
|
||||||
|
statePath string
|
||||||
|
system *platform.System
|
||||||
|
bootStarted time.Time
|
||||||
|
bootFolder string
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
workers map[string]*blackboxWorker
|
||||||
|
}
|
||||||
|
|
||||||
|
type discoveredBlackboxTarget struct {
|
||||||
|
marker BlackboxMarker
|
||||||
|
target platform.RemovableTarget
|
||||||
|
seenMount string
|
||||||
|
mountedByBee bool
|
||||||
|
}
|
||||||
|
|
||||||
|
type blackboxWorker struct {
|
||||||
|
runtime *blackboxRuntime
|
||||||
|
enrollmentID string
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
target platform.RemovableTarget
|
||||||
|
marker BlackboxMarker
|
||||||
|
mountpoint string
|
||||||
|
mountedByBee bool
|
||||||
|
status string
|
||||||
|
lastSyncAt time.Time
|
||||||
|
lastDuration time.Duration
|
||||||
|
flushPeriod time.Duration
|
||||||
|
lastError string
|
||||||
|
fastCycles int
|
||||||
|
stopCh chan struct{}
|
||||||
|
stoppedCh chan struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
|
||||||
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
|
if exportDir == "" {
|
||||||
|
exportDir = DefaultExportDir
|
||||||
|
}
|
||||||
|
statePath = strings.TrimSpace(statePath)
|
||||||
|
if statePath == "" {
|
||||||
|
statePath = DefaultBlackboxStatePath
|
||||||
|
}
|
||||||
|
if system == nil {
|
||||||
|
system = platform.New()
|
||||||
|
}
|
||||||
|
bootStarted, err := bootStartedAtUTC()
|
||||||
|
if err != nil {
|
||||||
|
bootStarted = blackboxNow()
|
||||||
|
}
|
||||||
|
rt := &blackboxRuntime{
|
||||||
|
exportDir: exportDir,
|
||||||
|
statePath: statePath,
|
||||||
|
system: system,
|
||||||
|
bootStarted: bootStarted,
|
||||||
|
bootFolder: SupportBundleBaseName(bootStarted),
|
||||||
|
workers: make(map[string]*blackboxWorker),
|
||||||
|
}
|
||||||
|
_ = os.MkdirAll(filepath.Dir(statePath), 0755)
|
||||||
|
rt.persistState()
|
||||||
|
ticker := time.NewTicker(blackboxDiscoverInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
rt.reconcile()
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
rt.stopAll()
|
||||||
|
return ctx.Err()
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ReadBlackboxState(path string) (BlackboxState, error) {
|
||||||
|
path = strings.TrimSpace(path)
|
||||||
|
if path == "" {
|
||||||
|
path = DefaultBlackboxStatePath
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return BlackboxState{}, err
|
||||||
|
}
|
||||||
|
var state BlackboxState
|
||||||
|
if err := json.Unmarshal(raw, &state); err != nil {
|
||||||
|
return BlackboxState{}, err
|
||||||
|
}
|
||||||
|
return state, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
if target.Device == "" {
|
||||||
|
return BlackboxMarker{}, fmt.Errorf("device is required")
|
||||||
|
}
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
|
||||||
|
if err != nil {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
marker, _, err := readBlackboxMarker(mountpoint)
|
||||||
|
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
if marker.EnrollmentID == "" {
|
||||||
|
marker = BlackboxMarker{
|
||||||
|
Version: 1,
|
||||||
|
EnrollmentID: newBlackboxEnrollmentID(),
|
||||||
|
CreatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||||
|
Host: hostnameOr("unknown"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := writeBlackboxMarker(mountpoint, marker); err != nil {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
return marker, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func DisableBlackboxTarget(device, enrollmentID string) error {
|
||||||
|
device = strings.TrimSpace(device)
|
||||||
|
enrollmentID = strings.TrimSpace(enrollmentID)
|
||||||
|
if device == "" && enrollmentID == "" {
|
||||||
|
return fmt.Errorf("device or enrollment_id is required")
|
||||||
|
}
|
||||||
|
system := platform.New()
|
||||||
|
targets, err := system.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, target := range targets {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
|
||||||
|
if mountErr != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
remove := false
|
||||||
|
marker, _, err := readBlackboxMarker(mountpoint)
|
||||||
|
if err == nil {
|
||||||
|
if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
|
||||||
|
remove = true
|
||||||
|
}
|
||||||
|
if device != "" && target.Device == device {
|
||||||
|
remove = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if remove {
|
||||||
|
err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
|
||||||
|
}
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
if remove {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return os.ErrNotExist
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) reconcile() {
|
||||||
|
discovered, _ := rt.discoverMarkedTargets()
|
||||||
|
|
||||||
|
rt.mu.Lock()
|
||||||
|
defer rt.mu.Unlock()
|
||||||
|
|
||||||
|
seen := make(map[string]struct{}, len(discovered))
|
||||||
|
for _, found := range discovered {
|
||||||
|
seen[found.marker.EnrollmentID] = struct{}{}
|
||||||
|
worker, ok := rt.workers[found.marker.EnrollmentID]
|
||||||
|
if !ok {
|
||||||
|
worker = newBlackboxWorker(rt, found)
|
||||||
|
rt.workers[found.marker.EnrollmentID] = worker
|
||||||
|
go worker.run()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
worker.update(found)
|
||||||
|
}
|
||||||
|
for id, worker := range rt.workers {
|
||||||
|
if _, ok := seen[id]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
worker.stop()
|
||||||
|
delete(rt.workers, id)
|
||||||
|
}
|
||||||
|
rt.persistStateLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) stopAll() {
|
||||||
|
rt.mu.Lock()
|
||||||
|
workers := make([]*blackboxWorker, 0, len(rt.workers))
|
||||||
|
for _, worker := range rt.workers {
|
||||||
|
workers = append(workers, worker)
|
||||||
|
}
|
||||||
|
rt.workers = map[string]*blackboxWorker{}
|
||||||
|
rt.persistStateLocked()
|
||||||
|
rt.mu.Unlock()
|
||||||
|
for _, worker := range workers {
|
||||||
|
worker.stop()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
|
||||||
|
targets, err := rt.system.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var out []discoveredBlackboxTarget
|
||||||
|
for _, rawTarget := range targets {
|
||||||
|
target := sanitizeRemovableTarget(rawTarget)
|
||||||
|
if target.Device == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
marker, ok, err := readBlackboxMarker(mountpoint)
|
||||||
|
if mountedByBee && !ok {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
if err != nil || !ok || marker.EnrollmentID == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
out = append(out, discoveredBlackboxTarget{
|
||||||
|
marker: marker,
|
||||||
|
target: target,
|
||||||
|
seenMount: mountpoint,
|
||||||
|
mountedByBee: mountedByBee,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(out, func(i, j int) bool {
|
||||||
|
return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
|
||||||
|
})
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
|
||||||
|
return &blackboxWorker{
|
||||||
|
runtime: rt,
|
||||||
|
enrollmentID: found.marker.EnrollmentID,
|
||||||
|
target: found.target,
|
||||||
|
marker: found.marker,
|
||||||
|
flushPeriod: blackboxMinFlushPeriod,
|
||||||
|
status: "running",
|
||||||
|
stopCh: make(chan struct{}),
|
||||||
|
stoppedCh: make(chan struct{}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) run() {
|
||||||
|
defer close(w.stoppedCh)
|
||||||
|
for {
|
||||||
|
start := time.Now()
|
||||||
|
err := w.syncCycle()
|
||||||
|
duration := time.Since(start)
|
||||||
|
w.finishCycle(duration, err)
|
||||||
|
|
||||||
|
wait := w.currentFlushPeriod()
|
||||||
|
timer := time.NewTimer(wait)
|
||||||
|
select {
|
||||||
|
case <-w.stopCh:
|
||||||
|
timer.Stop()
|
||||||
|
w.cleanup()
|
||||||
|
return
|
||||||
|
case <-timer.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.target = found.target
|
||||||
|
w.marker = found.marker
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) stop() {
|
||||||
|
select {
|
||||||
|
case <-w.stopCh:
|
||||||
|
default:
|
||||||
|
close(w.stopCh)
|
||||||
|
}
|
||||||
|
<-w.stoppedCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.flushPeriod
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.lastDuration = duration
|
||||||
|
if err != nil {
|
||||||
|
w.status = "degraded"
|
||||||
|
w.lastError = err.Error()
|
||||||
|
w.fastCycles = 0
|
||||||
|
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
|
||||||
|
} else {
|
||||||
|
w.status = "running"
|
||||||
|
w.lastSyncAt = blackboxNow()
|
||||||
|
w.lastError = ""
|
||||||
|
if duration <= w.flushPeriod/2 {
|
||||||
|
w.fastCycles++
|
||||||
|
} else {
|
||||||
|
w.fastCycles = 0
|
||||||
|
}
|
||||||
|
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||||
|
}
|
||||||
|
w.runtime.persistState()
|
||||||
|
}
|
||||||
|
|
||||||
|
func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
|
||||||
|
if current <= 0 {
|
||||||
|
current = blackboxMinFlushPeriod
|
||||||
|
}
|
||||||
|
if duration <= 0 {
|
||||||
|
duration = current
|
||||||
|
}
|
||||||
|
next := current
|
||||||
|
if duration > current {
|
||||||
|
growA := time.Duration(float64(current) * 1.25)
|
||||||
|
growB := time.Duration(float64(duration) * 1.25)
|
||||||
|
if growB > growA {
|
||||||
|
next = growB
|
||||||
|
} else {
|
||||||
|
next = growA
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if success && fastCycles >= blackboxRecoveryFastCount {
|
||||||
|
next = time.Duration(float64(current) * 0.9)
|
||||||
|
}
|
||||||
|
if next < blackboxMinFlushPeriod {
|
||||||
|
next = blackboxMinFlushPeriod
|
||||||
|
}
|
||||||
|
if next > blackboxMaxFlushPeriod {
|
||||||
|
next = blackboxMaxFlushPeriod
|
||||||
|
}
|
||||||
|
return next
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) syncCycle() error {
|
||||||
|
target, marker := w.snapshotTarget()
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
w.recordMountpoint(mountpoint, mountedByBee)
|
||||||
|
|
||||||
|
root := filepath.Join(mountpoint, w.runtime.bootFolder)
|
||||||
|
if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := w.captureSnapshots(root); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return syncFilesystem(root)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) cleanup() {
|
||||||
|
w.mu.Lock()
|
||||||
|
mountpoint := w.mountpoint
|
||||||
|
mountedByBee := w.mountedByBee
|
||||||
|
w.mu.Unlock()
|
||||||
|
if mountedByBee && mountpoint != "" {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.target, w.marker
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.mountpoint = mountpoint
|
||||||
|
w.mountedByBee = mountedByBee
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) captureSnapshots(root string) error {
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, svc := range supportBundleServices {
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, item := range supportBundleOptionalFiles {
|
||||||
|
if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) persistState() {
|
||||||
|
rt.mu.Lock()
|
||||||
|
defer rt.mu.Unlock()
|
||||||
|
rt.persistStateLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) persistStateLocked() {
|
||||||
|
state := BlackboxState{
|
||||||
|
Status: "disabled",
|
||||||
|
BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
|
||||||
|
BootFolder: rt.bootFolder,
|
||||||
|
UpdatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||||
|
Targets: make([]BlackboxTargetStatus, 0, len(rt.workers)),
|
||||||
|
}
|
||||||
|
if len(rt.workers) > 0 {
|
||||||
|
state.Status = "running"
|
||||||
|
}
|
||||||
|
for _, worker := range rt.workers {
|
||||||
|
worker.mu.Lock()
|
||||||
|
targetState := BlackboxTargetStatus{
|
||||||
|
EnrollmentID: worker.enrollmentID,
|
||||||
|
Device: worker.target.Device,
|
||||||
|
FS: worker.target,
|
||||||
|
BootFolder: rt.bootFolder,
|
||||||
|
Status: worker.status,
|
||||||
|
FlushPeriod: worker.flushPeriod.String(),
|
||||||
|
LastError: worker.lastError,
|
||||||
|
Mountpoint: worker.mountpoint,
|
||||||
|
}
|
||||||
|
if !worker.lastSyncAt.IsZero() {
|
||||||
|
targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
|
||||||
|
}
|
||||||
|
if worker.lastDuration > 0 {
|
||||||
|
targetState.LastCycleDuration = worker.lastDuration.String()
|
||||||
|
}
|
||||||
|
if worker.status == "degraded" {
|
||||||
|
state.Status = "degraded"
|
||||||
|
}
|
||||||
|
worker.mu.Unlock()
|
||||||
|
state.Targets = append(state.Targets, targetState)
|
||||||
|
}
|
||||||
|
sort.Slice(state.Targets, func(i, j int) bool {
|
||||||
|
return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
|
||||||
|
})
|
||||||
|
_ = writeJSONAtomic(rt.statePath, state)
|
||||||
|
}
|
||||||
|
|
||||||
|
func bootStartedAtUTC() (time.Time, error) {
|
||||||
|
raw, err := os.ReadFile("/proc/stat")
|
||||||
|
if err != nil {
|
||||||
|
return time.Time{}, err
|
||||||
|
}
|
||||||
|
for _, line := range strings.Split(string(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if !strings.HasPrefix(line, "btime ") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Fields(line)
|
||||||
|
if len(parts) != 2 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
sec, err := time.ParseDuration(parts[1] + "s")
|
||||||
|
if err != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
return time.Unix(int64(sec/time.Second), 0).UTC(), nil
|
||||||
|
}
|
||||||
|
return time.Time{}, fmt.Errorf("boot time not found")
|
||||||
|
}
|
||||||
|
|
||||||
|
func newBlackboxEnrollmentID() string {
|
||||||
|
var buf [8]byte
|
||||||
|
if _, err := rand.Read(buf[:]); err != nil {
|
||||||
|
return fmt.Sprintf("bb-%d", time.Now().UnixNano())
|
||||||
|
}
|
||||||
|
return "bb-" + hex.EncodeToString(buf[:])
|
||||||
|
}
|
||||||
|
|
||||||
|
func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
|
||||||
|
target.Device = strings.TrimSpace(target.Device)
|
||||||
|
target.FSType = strings.TrimSpace(target.FSType)
|
||||||
|
target.Size = strings.TrimSpace(target.Size)
|
||||||
|
target.Label = strings.TrimSpace(target.Label)
|
||||||
|
target.Model = strings.TrimSpace(target.Model)
|
||||||
|
target.Mountpoint = strings.TrimSpace(target.Mountpoint)
|
||||||
|
return target
|
||||||
|
}
|
||||||
|
|
||||||
|
func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
if target.Mountpoint != "" {
|
||||||
|
if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
|
||||||
|
return target.Mountpoint, false, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
|
||||||
|
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||||
|
return "", false, formatBlackboxMountTargetError(target, string(raw), err)
|
||||||
|
}
|
||||||
|
if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
return mountpoint, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func unmountTarget(mountpoint string) error {
|
||||||
|
_ = blackboxExecCommand("sync").Run()
|
||||||
|
raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
msg := strings.TrimSpace(string(raw))
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
|
||||||
|
raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
return BlackboxMarker{}, false, os.ErrNotExist
|
||||||
|
}
|
||||||
|
return BlackboxMarker{}, false, err
|
||||||
|
}
|
||||||
|
var marker BlackboxMarker
|
||||||
|
if err := json.Unmarshal(raw, &marker); err != nil {
|
||||||
|
return BlackboxMarker{}, false, err
|
||||||
|
}
|
||||||
|
return marker, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
|
||||||
|
if marker.Version == 0 {
|
||||||
|
marker.Version = 1
|
||||||
|
}
|
||||||
|
return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
|
||||||
|
}
|
||||||
|
|
||||||
|
func syncDirectoryTree(srcDir, dstDir string) error {
|
||||||
|
seen := make(map[string]struct{})
|
||||||
|
err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(srcDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel = filepath.Clean(rel)
|
||||||
|
if rel == "." {
|
||||||
|
seen["."] = struct{}{}
|
||||||
|
return os.MkdirAll(dstDir, 0755)
|
||||||
|
}
|
||||||
|
seen[rel] = struct{}{}
|
||||||
|
dstPath := filepath.Join(dstDir, rel)
|
||||||
|
if d.IsDir() {
|
||||||
|
info, err := d.Info()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.MkdirAll(dstPath, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
return copyFileIfChanged(path, dstPath)
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return removeMissingPaths(dstDir, seen)
|
||||||
|
}
|
||||||
|
|
||||||
|
func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
|
||||||
|
return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(dstDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel = filepath.Clean(rel)
|
||||||
|
if rel == "." {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if _, ok := seen[rel]; ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return os.RemoveAll(path)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func copyFileIfChanged(src, dst string) error {
|
||||||
|
info, err := os.Stat(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return os.MkdirAll(dst, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
srcData, err := os.ReadFile(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return writeFileAtomic(dst, srcData, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
|
||||||
|
func captureCommandAtomic(dst string, name string, args ...string) error {
|
||||||
|
raw, err := blackboxExecCommand(name, args...).CombinedOutput()
|
||||||
|
if len(raw) == 0 {
|
||||||
|
if err != nil {
|
||||||
|
raw = []byte(err.Error() + "\n")
|
||||||
|
} else {
|
||||||
|
raw = []byte("no output\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return writeFileAtomic(dst, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeJSONAtomic(path string, v any) error {
|
||||||
|
raw, err := json.MarshalIndent(v, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
raw = append(raw, '\n')
|
||||||
|
return writeFileAtomic(path, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if _, err := f.Write(data); err != nil {
|
||||||
|
_ = f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := f.Sync(); err != nil {
|
||||||
|
_ = f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := f.Close(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := os.Rename(tmp, path); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return syncFilesystem(filepath.Dir(path))
|
||||||
|
}
|
||||||
|
|
||||||
|
// syncFilesystem flushes dirty buffers to disk after an atomic rename.
// NOTE(review): the path argument is currently ignored — the external `sync`
// command flushes every mounted filesystem, not just the one containing path.
func syncFilesystem(path string) error {
	return blackboxExecCommand("sync").Run()
}
|
||||||
|
|
||||||
|
func ensureWritableBlackboxMountpoint(mountpoint string) error {
|
||||||
|
probe, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("target filesystem is not writable: %w", err)
|
||||||
|
}
|
||||||
|
name := probe.Name()
|
||||||
|
if closeErr := probe.Close(); closeErr != nil {
|
||||||
|
_ = os.Remove(name)
|
||||||
|
return closeErr
|
||||||
|
}
|
||||||
|
if err := os.Remove(name); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
|
||||||
|
msg := strings.TrimSpace(raw)
|
||||||
|
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||||
|
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||||
|
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
52
audit/internal/app/blackbox_test.go
Normal file
52
audit/internal/app/blackbox_test.go
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
|
||||||
|
current := 2 * time.Second
|
||||||
|
got := adjustFlushPeriod(current, 4*time.Second, false, 0)
|
||||||
|
if got <= current {
|
||||||
|
t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
|
||||||
|
current := 10 * time.Second
|
||||||
|
got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
|
||||||
|
if got >= current {
|
||||||
|
t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
|
||||||
|
}
|
||||||
|
if got < blackboxMinFlushPeriod {
|
||||||
|
t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadBlackboxState(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "blackbox-state.json")
|
||||||
|
want := BlackboxState{
|
||||||
|
Status: "running",
|
||||||
|
BootStartedAtUTC: "2026-04-24T00:00:00Z",
|
||||||
|
BootFolder: "boot-folder",
|
||||||
|
UpdatedAtUTC: "2026-04-24T00:00:01Z",
|
||||||
|
Targets: []BlackboxTargetStatus{{
|
||||||
|
EnrollmentID: "bb-1",
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
Status: "running",
|
||||||
|
FlushPeriod: "1s",
|
||||||
|
}},
|
||||||
|
}
|
||||||
|
if err := writeJSONAtomic(path, want); err != nil {
|
||||||
|
t.Fatalf("writeJSONAtomic: %v", err)
|
||||||
|
}
|
||||||
|
got, err := ReadBlackboxState(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadBlackboxState: %v", err)
|
||||||
|
}
|
||||||
|
if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
|
||||||
|
t.Fatalf("state=%+v", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -15,6 +15,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var supportBundleServices = []string{
|
var supportBundleServices = []string{
|
||||||
|
"bee-blackbox.service",
|
||||||
"bee-audit.service",
|
"bee-audit.service",
|
||||||
"bee-web.service",
|
"bee-web.service",
|
||||||
"bee-network.service",
|
"bee-network.service",
|
||||||
@@ -256,11 +257,6 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
now := time.Now().UTC()
|
now := time.Now().UTC()
|
||||||
date := now.Format("2006-01-02")
|
|
||||||
tod := now.Format("150405")
|
|
||||||
ver := bundleVersion()
|
|
||||||
model := serverModelForBundle()
|
|
||||||
sn := serverSerialForBundle()
|
|
||||||
|
|
||||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||||
@@ -294,7 +290,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
archiveName := SupportBundleBaseName(now) + ".tar.gz"
|
||||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
@@ -302,6 +298,16 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return archivePath, nil
|
return archivePath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SupportBundleBaseName(at time.Time) string {
|
||||||
|
at = at.UTC()
|
||||||
|
date := at.Format("2006-01-02")
|
||||||
|
tod := at.Format("150405")
|
||||||
|
ver := bundleVersion()
|
||||||
|
model := serverModelForBundle()
|
||||||
|
sn := serverSerialForBundle()
|
||||||
|
return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
|
||||||
|
}
|
||||||
|
|
||||||
func LatestSupportBundlePath() (string, error) {
|
func LatestSupportBundlePath() (string, error) {
|
||||||
return latestSupportBundlePath(os.TempDir())
|
return latestSupportBundlePath(os.TempDir())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package collector
|
|||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
"bufio"
|
"bufio"
|
||||||
|
"context"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
|
|||||||
return string(out), nil
|
return string(out), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
var execIpmitool = func(args ...string) (string, error) {
|
|
||||||
out, err := exec.Command("ipmitool", args...).Output()
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return string(out), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||||
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
|||||||
|
|
||||||
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
||||||
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
||||||
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
|
func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
|
||||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
out, err := execIpmitool("mc", "info")
|
profile := selectIPMIProfile(manufacturer)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
|
||||||
|
defer cancel()
|
||||||
|
cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
|
||||||
|
raw, err := cmd.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
version := parseBMCFirmwareRevision(out)
|
version := parseBMCFirmwareRevision(string(raw))
|
||||||
if version == "" {
|
if version == "" {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
|||||||
board, biosFW := collectBoard()
|
board, biosFW := collectBoard()
|
||||||
snap.Board = board
|
snap.Board = board
|
||||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
|
||||||
|
|
||||||
snap.CPUs = collectCPUs()
|
snap.CPUs = collectCPUs()
|
||||||
|
|
||||||
@@ -34,6 +34,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
|||||||
}
|
}
|
||||||
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
||||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||||
|
bestEffortRescanHotplugStorage()
|
||||||
snap.Storage = collectStorage()
|
snap.Storage = collectStorage()
|
||||||
snap.PCIeDevices = collectPCIe()
|
snap.PCIeDevices = collectPCIe()
|
||||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||||
@@ -44,7 +45,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
|||||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||||
snap.PowerSupplies = collectPSUs()
|
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||||
|
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||||
finalizeSnapshot(&snap, collectedAt)
|
finalizeSnapshot(&snap, collectedAt)
|
||||||
|
|||||||
92
audit/internal/collector/ipmi_profile.go
Normal file
92
audit/internal/collector/ipmi_profile.go
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
// Package-level IPMI tuning profiles.
|
||||||
|
//
|
||||||
|
// Each profile is matched by board manufacturer (already known before PSU
|
||||||
|
// collection runs). The profile drives two things:
|
||||||
|
// - Per-command timeouts — prevents infinite hangs on slow BMCs.
|
||||||
|
// - FRU early-exit — streaming parser stops reading once all PSU entries
|
||||||
|
// are found, avoiding the tail of non-PSU FRU records.
|
||||||
|
//
|
||||||
|
// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ipmiProfile holds tuning parameters for one or more board manufacturers.
|
||||||
|
type ipmiProfile struct {
|
||||||
|
// name is shown in log messages.
|
||||||
|
name string
|
||||||
|
// manufacturers is a list of lowercase substrings matched against the
|
||||||
|
// board manufacturer string from dmidecode type 1.
|
||||||
|
manufacturers []string
|
||||||
|
// fruTimeout is the hard deadline for the entire `ipmitool fru print`
|
||||||
|
// command. Zero means no timeout (not recommended).
|
||||||
|
fruTimeout time.Duration
|
||||||
|
// sdrTimeout is the hard deadline for `ipmitool sdr`.
|
||||||
|
sdrTimeout time.Duration
|
||||||
|
// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
|
||||||
|
mcInfoTimeout time.Duration
|
||||||
|
// fruEarlyExit instructs the streaming FRU parser to stop reading
|
||||||
|
// after it has found at least one PSU entry and the current block is
|
||||||
|
// complete. Useful on servers with many non-PSU FRU devices.
|
||||||
|
fruEarlyExit bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// ipmiProfiles is the ordered list of profiles. First match wins.
|
||||||
|
var ipmiProfiles = []ipmiProfile{
|
||||||
|
{
|
||||||
|
// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
|
||||||
|
// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
|
||||||
|
// full `fru print` scan takes ~108 s on a loaded BMC. Enable early
|
||||||
|
// exit so collection stops once PSU records are found.
|
||||||
|
name: "lenovo",
|
||||||
|
manufacturers: []string{"lenovo"},
|
||||||
|
fruTimeout: 90 * time.Second,
|
||||||
|
sdrTimeout: 45 * time.Second,
|
||||||
|
mcInfoTimeout: 15 * time.Second,
|
||||||
|
fruEarlyExit: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// HPE iLO-based servers (ProLiant DL/ML/BL).
|
||||||
|
name: "hpe",
|
||||||
|
manufacturers: []string{"hp", "hewlett packard"},
|
||||||
|
fruTimeout: 60 * time.Second,
|
||||||
|
sdrTimeout: 30 * time.Second,
|
||||||
|
mcInfoTimeout: 10 * time.Second,
|
||||||
|
fruEarlyExit: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Dell iDRAC-based servers.
|
||||||
|
name: "dell",
|
||||||
|
manufacturers: []string{"dell"},
|
||||||
|
fruTimeout: 60 * time.Second,
|
||||||
|
sdrTimeout: 30 * time.Second,
|
||||||
|
mcInfoTimeout: 10 * time.Second,
|
||||||
|
fruEarlyExit: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// defaultIPMIProfile is used when no vendor profile matches.
|
||||||
|
var defaultIPMIProfile = ipmiProfile{
|
||||||
|
name: "default",
|
||||||
|
fruTimeout: 60 * time.Second,
|
||||||
|
sdrTimeout: 30 * time.Second,
|
||||||
|
mcInfoTimeout: 10 * time.Second,
|
||||||
|
fruEarlyExit: false,
|
||||||
|
}
|
||||||
|
|
||||||
|
// selectIPMIProfile returns the profile for the given board manufacturer.
|
||||||
|
func selectIPMIProfile(manufacturer string) ipmiProfile {
|
||||||
|
mfgLower := strings.ToLower(strings.TrimSpace(manufacturer))
|
||||||
|
for _, p := range ipmiProfiles {
|
||||||
|
for _, m := range p.manufacturers {
|
||||||
|
if strings.Contains(mfgLower, m) {
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return defaultIPMIProfile
|
||||||
|
}
|
||||||
@@ -4,7 +4,9 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
@@ -140,6 +142,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||||
dev.NUMANode = &numaNode
|
dev.NUMANode = &numaNode
|
||||||
}
|
}
|
||||||
|
if group, ok := readPCIIOMMUGroup(bdf); ok {
|
||||||
|
dev.IOMMUGroup = &group
|
||||||
|
}
|
||||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||||
dev.LinkWidth = &width
|
dev.LinkWidth = &width
|
||||||
}
|
}
|
||||||
@@ -179,6 +184,21 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
return dev
|
return dev
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readPCIIOMMUGroup resolves the IOMMU group number for a PCI BDF via the
// sysfs symlink .../devices/<bdf>/iommu_group, whose target's basename is
// the group number. Returns (0, false) when the link is absent or its
// target is not numeric.
func readPCIIOMMUGroup(bdf string) (int, bool) {
	target, err := os.Readlink("/sys/bus/pci/devices/" + bdf + "/iommu_group")
	if err != nil {
		return 0, false
	}
	group, convErr := strconv.Atoi(filepath.Base(target))
	if convErr != nil {
		return 0, false
	}
	return group, true
}
|
||||||
|
|
||||||
// readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
|
// readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
|
||||||
func readPCIIDs(bdf string) (vendorID, deviceID int) {
|
func readPCIIDs(bdf string) (vendorID, deviceID int) {
|
||||||
base := "/sys/bus/pci/devices/" + bdf
|
base := "/sys/bus/pci/devices/" + bdf
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ package collector
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
|
"bufio"
|
||||||
|
"context"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"regexp"
|
"regexp"
|
||||||
@@ -10,16 +12,29 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
func collectPSUs() []schema.HardwarePowerSupply {
|
func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
|
||||||
|
profile := selectIPMIProfile(manufacturer)
|
||||||
|
|
||||||
var psus []schema.HardwarePowerSupply
|
var psus []schema.HardwarePowerSupply
|
||||||
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
|
||||||
psus = parseFRU(string(out))
|
defer fruCancel()
|
||||||
|
|
||||||
|
if profile.fruEarlyExit {
|
||||||
|
psus = collectFRUEarlyExit(fruCtx)
|
||||||
} else {
|
} else {
|
||||||
slog.Info("psu: fru unavailable", "err", err)
|
cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
|
||||||
|
if out, err := cmd.Output(); err == nil {
|
||||||
|
psus = parseFRU(string(out))
|
||||||
|
} else {
|
||||||
|
slog.Info("psu: fru unavailable", "err", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sdrData := map[int]psuSDR{}
|
sdrData := map[int]psuSDR{}
|
||||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
|
||||||
|
defer sdrCancel()
|
||||||
|
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
|
||||||
|
if sdrOut, err := cmd.Output(); err == nil {
|
||||||
sdrData = parsePSUSDR(string(sdrOut))
|
sdrData = parsePSUSDR(string(sdrOut))
|
||||||
if len(psus) == 0 {
|
if len(psus) == 0 {
|
||||||
psus = synthesizePSUsFromSDR(sdrData)
|
psus = synthesizePSUsFromSDR(sdrData)
|
||||||
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
|
|||||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
slog.Info("psu: collected", "count", len(psus))
|
slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
|
||||||
|
// as soon as it has found all PSU blocks and the next block is not a PSU.
|
||||||
|
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
|
||||||
|
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
|
||||||
|
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
|
||||||
|
pipe, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("psu: fru pipe unavailable", "err", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
slog.Info("psu: fru start failed", "err", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var psus []schema.HardwarePowerSupply
|
||||||
|
var currentBlock strings.Builder
|
||||||
|
slot := 0
|
||||||
|
psuFound := false
|
||||||
|
stoppedEarly := false
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(pipe)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
|
||||||
|
if strings.HasPrefix(line, "FRU Device Description") {
|
||||||
|
if currentBlock.Len() > 0 {
|
||||||
|
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||||
|
psus = append(psus, psu)
|
||||||
|
psuFound = true
|
||||||
|
slot++
|
||||||
|
}
|
||||||
|
currentBlock.Reset()
|
||||||
|
}
|
||||||
|
// Stop once we've collected PSUs and hit a non-PSU block header.
|
||||||
|
if psuFound && !isPSUHeader(strings.ToLower(line)) {
|
||||||
|
stoppedEarly = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
currentBlock.WriteString(line)
|
||||||
|
currentBlock.WriteByte('\n')
|
||||||
|
}
|
||||||
|
|
||||||
|
if !stoppedEarly && currentBlock.Len() > 0 {
|
||||||
|
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||||
|
psus = append(psus, psu)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Kill the process immediately on early exit rather than waiting for context timeout.
|
||||||
|
if cmd.Process != nil {
|
||||||
|
cmd.Process.Kill() //nolint:errcheck
|
||||||
|
}
|
||||||
|
cmd.Wait() //nolint:errcheck
|
||||||
|
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
|
||||||
return psus
|
return psus
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
|
|||||||
return arrays
|
return arrays
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// collectVROCLicense runs mdadm --detail-platform and extracts the License field.
|
||||||
|
// Returns nil when VROC is absent or the platform does not report a license.
|
||||||
|
func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
|
||||||
|
if !hasVROCController(pcie) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out, err := raidToolQuery("mdadm", "--detail-platform")
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("vroc: mdadm --detail-platform unavailable", "err", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return parseMDAdmPlatformLicense(string(out))
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseMDAdmPlatformLicense scans `mdadm --detail-platform` output for a
// "License : <value>" line and returns the lowercased value, or nil when no
// non-empty license field is present.
func parseMDAdmPlatformLicense(raw string) *string {
	for _, line := range strings.Split(raw, "\n") {
		entry := strings.TrimSpace(line)
		if !strings.HasPrefix(strings.ToLower(entry), "license") {
			continue
		}
		_, rest, found := strings.Cut(entry, ":")
		if !found {
			continue
		}
		if value := strings.TrimSpace(rest); value != "" {
			lowered := strings.ToLower(value)
			return &lowered
		}
	}
	return nil
}
|
||||||
|
|
||||||
func queryDeviceSerial(devPath string) string {
|
func queryDeviceSerial(devPath string) string {
|
||||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||||
var ctrl nvmeIDCtrl
|
var ctrl nvmeIDCtrl
|
||||||
|
|||||||
@@ -4,12 +4,52 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
pciRescanPath = "/sys/bus/pci/rescan"
|
||||||
|
scsiHostScanGlob = "/sys/class/scsi_host/host*/scan"
|
||||||
|
hotplugWriteFile = os.WriteFile
|
||||||
|
hotplugExecCommand = exec.Command
|
||||||
|
hotplugGlob = filepath.Glob
|
||||||
|
nvmeLBAFCompactRE = regexp.MustCompile(`(?im)^\s*lbaf\s+\d+\s*:\s*ms:(\d+)\s+lbads:(\d+).*?\(in use\)\s*$`)
|
||||||
|
nvmeLBAFVerboseRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+\d+\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*?\(in use\)\s*$`)
|
||||||
|
sgReadcapBlockRE = regexp.MustCompile(`(?im)logical block length\s*=\s*(\d+)\s+bytes`)
|
||||||
|
sgReadcapProtRE = regexp.MustCompile(`(?im)prot_en\s*=\s*1`)
|
||||||
|
)
|
||||||
|
|
||||||
|
func bestEffortRescanHotplugStorage() {
|
||||||
|
if err := hotplugWriteFile(pciRescanPath, []byte("1\n"), 0644); err != nil {
|
||||||
|
slog.Info("storage: pci rescan skipped", "path", pciRescanPath, "err", err)
|
||||||
|
} else {
|
||||||
|
slog.Info("storage: triggered pci rescan for hotplug discovery")
|
||||||
|
}
|
||||||
|
|
||||||
|
hostPaths, err := hotplugGlob(scsiHostScanGlob)
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
||||||
|
} else {
|
||||||
|
for _, path := range hostPaths {
|
||||||
|
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
||||||
|
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
slog.Info("storage: triggered scsi host scan", "path", path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out, err := hotplugExecCommand("udevadm", "settle", "--timeout=10").CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("storage: udev settle after hotplug rescan failed", "err", err, "output", strings.TrimSpace(string(out)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func collectStorage() []schema.HardwareStorage {
|
func collectStorage() []schema.HardwareStorage {
|
||||||
devs := discoverStorageDevices()
|
devs := discoverStorageDevices()
|
||||||
result := make([]schema.HardwareStorage, 0, len(devs))
|
result := make([]schema.HardwareStorage, 0, len(devs))
|
||||||
@@ -35,6 +75,8 @@ type lsblkDevice struct {
|
|||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
Tran string `json:"tran"`
|
Tran string `json:"tran"`
|
||||||
Hctl string `json:"hctl"`
|
Hctl string `json:"hctl"`
|
||||||
|
LogSec string `json:"log-sec"`
|
||||||
|
PhySec string `json:"phy-sec"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type lsblkRoot struct {
|
type lsblkRoot struct {
|
||||||
@@ -101,7 +143,7 @@ func isVirtualHDiskModel(model string) bool {
|
|||||||
|
|
||||||
func lsblkDevices() []lsblkDevice {
|
func lsblkDevices() []lsblkDevice {
|
||||||
out, err := exec.Command("lsblk", "-J", "-d",
|
out, err := exec.Command("lsblk", "-J", "-d",
|
||||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL,LOG-SEC,PHY-SEC").Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("storage: lsblk failed", "err", err)
|
slog.Warn("storage: lsblk failed", "err", err)
|
||||||
return nil
|
return nil
|
||||||
@@ -208,6 +250,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
present := true
|
present := true
|
||||||
s := schema.HardwareStorage{Present: &present}
|
s := schema.HardwareStorage{Present: &present}
|
||||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||||
|
applyStorageBlockGeometry(&s, dev)
|
||||||
|
|
||||||
tran := strings.ToLower(dev.Tran)
|
tran := strings.ToLower(dev.Tran)
|
||||||
devPath := "/dev/" + dev.Name
|
devPath := "/dev/" + dev.Name
|
||||||
@@ -250,6 +293,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var info smartctlInfo
|
var info smartctlInfo
|
||||||
|
var raw map[string]any
|
||||||
|
_ = json.Unmarshal(out, &raw)
|
||||||
if err := json.Unmarshal(out, &info); err == nil {
|
if err := json.Unmarshal(out, &info); err == nil {
|
||||||
if v := cleanDMIValue(info.ModelName); v != "" {
|
if v := cleanDMIValue(info.ModelName); v != "" {
|
||||||
s.Model = &v
|
s.Model = &v
|
||||||
@@ -302,8 +347,11 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
value := float64(attr.Raw.Value)
|
value := float64(attr.Raw.Value)
|
||||||
s.LifeRemainingPct = &value
|
s.LifeRemainingPct = &value
|
||||||
case 241:
|
case 241:
|
||||||
value := attr.Raw.Value
|
value := smartLBAsToBytes(attr.Raw.Value)
|
||||||
s.WrittenBytes = &value
|
s.WrittenBytes = &value
|
||||||
|
case 242:
|
||||||
|
value := smartLBAsToBytes(attr.Raw.Value)
|
||||||
|
s.ReadBytes = &value
|
||||||
case 197:
|
case 197:
|
||||||
pending = attr.Raw.Value
|
pending = attr.Raw.Value
|
||||||
s.CurrentPendingSectors = &pending
|
s.CurrentPendingSectors = &pending
|
||||||
@@ -321,6 +369,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
offlineUncorrectable: uncorrectable,
|
offlineUncorrectable: uncorrectable,
|
||||||
lifeRemainingPct: lifeRemaining,
|
lifeRemainingPct: lifeRemaining,
|
||||||
}
|
}
|
||||||
|
applySCSISmartctlTelemetry(&s, raw, &status)
|
||||||
|
applySCSIProtectionBlockGeometry(&s, devPath)
|
||||||
setStorageHealthStatus(&s, status)
|
setStorageHealthStatus(&s, status)
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
@@ -368,6 +418,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
Interface: &iface,
|
Interface: &iface,
|
||||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||||
}
|
}
|
||||||
|
applyStorageBlockGeometry(&s, dev)
|
||||||
|
|
||||||
devPath := "/dev/" + dev.Name
|
devPath := "/dev/" + dev.Name
|
||||||
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
||||||
@@ -402,6 +453,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
applyNVMeBlockGeometry(&s, devPath)
|
||||||
|
|
||||||
// smart-log: wear telemetry
|
// smart-log: wear telemetry
|
||||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||||
@@ -477,6 +529,251 @@ func nvmeDataUnitsToBytes(units int64) int64 {
|
|||||||
return units * 512000
|
return units * 512000
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func smartLBAsToBytes(lbas int64) int64 {
|
||||||
|
if lbas <= 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return lbas * 512
|
||||||
|
}
|
||||||
|
|
||||||
|
func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, status *storageHealthStatus) {
|
||||||
|
if s == nil || len(raw) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if v, ok := firstInt64(raw,
|
||||||
|
"path:power_on_time.hours",
|
||||||
|
"path:accumulated_power_on_time.hours",
|
||||||
|
"path:power_on_time.hour",
|
||||||
|
"path:accumulated_power_on_time.hour",
|
||||||
|
); ok && v > 0 && s.PowerOnHours == nil {
|
||||||
|
s.PowerOnHours = &v
|
||||||
|
}
|
||||||
|
if v, ok := firstInt64(raw,
|
||||||
|
"path:power_cycle_count",
|
||||||
|
"path:start_stop_cycle_count",
|
||||||
|
"path:accumulated_start_stop_cycles",
|
||||||
|
); ok && v > 0 && s.PowerCycles == nil {
|
||||||
|
s.PowerCycles = &v
|
||||||
|
}
|
||||||
|
if v, ok := firstInt64(raw,
|
||||||
|
"path:scsi_grown_defect_list",
|
||||||
|
"path:grown_defect_list",
|
||||||
|
); ok && v > 0 && s.ReallocatedSectors == nil {
|
||||||
|
s.ReallocatedSectors = &v
|
||||||
|
if status != nil && status.reallocatedSectors == 0 {
|
||||||
|
status.reallocatedSectors = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v, ok := firstInt64(raw,
|
||||||
|
"path:percentage_used_endurance_indicator",
|
||||||
|
"path:scsi_percentage_used_endurance_indicator",
|
||||||
|
); ok && v > 0 {
|
||||||
|
if s.LifeUsedPct == nil {
|
||||||
|
fv := float64(v)
|
||||||
|
s.LifeUsedPct = &fv
|
||||||
|
}
|
||||||
|
if s.LifeRemainingPct == nil && v <= 100 {
|
||||||
|
remaining := float64(100 - v)
|
||||||
|
s.LifeRemainingPct = &remaining
|
||||||
|
if status != nil && status.lifeRemainingPct == 0 {
|
||||||
|
status.lifeRemainingPct = int64(remaining)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
blockSize, hasBlockSize := firstInt64(raw,
|
||||||
|
"path:logical_block_size",
|
||||||
|
"path:block_size",
|
||||||
|
"path:user_capacity.block_size",
|
||||||
|
)
|
||||||
|
if hasBlockSize && blockSize > 0 {
|
||||||
|
if s.LogicalBlockSizeBytes == nil {
|
||||||
|
s.LogicalBlockSizeBytes = &blockSize
|
||||||
|
}
|
||||||
|
if s.MetadataBytesPerBlock == nil {
|
||||||
|
zero := int64(0)
|
||||||
|
s.MetadataBytesPerBlock = &zero
|
||||||
|
}
|
||||||
|
if s.Telemetry == nil {
|
||||||
|
s.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
s.Telemetry["logical_block_size_bytes"] = *s.LogicalBlockSizeBytes
|
||||||
|
s.Telemetry["metadata_bytes_per_block"] = *s.MetadataBytesPerBlock
|
||||||
|
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||||
|
if v, ok := firstInt64(raw,
|
||||||
|
"path:logical_blocks_written",
|
||||||
|
"path:total_lbas_written",
|
||||||
|
); ok && v > 0 && s.WrittenBytes == nil {
|
||||||
|
bytes := v * blockSize
|
||||||
|
s.WrittenBytes = &bytes
|
||||||
|
}
|
||||||
|
if v, ok := firstInt64(raw,
|
||||||
|
"path:logical_blocks_read",
|
||||||
|
"path:total_lbas_read",
|
||||||
|
); ok && v > 0 && s.ReadBytes == nil {
|
||||||
|
bytes := v * blockSize
|
||||||
|
s.ReadBytes = &bytes
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
logical := parseStorageBytes(dev.LogSec)
|
||||||
|
physical := parseStorageBytes(dev.PhySec)
|
||||||
|
if logical <= 0 && physical <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if s.Telemetry == nil {
|
||||||
|
s.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
if logical > 0 {
|
||||||
|
s.LogicalBlockSizeBytes = &logical
|
||||||
|
s.Telemetry["logical_block_size_bytes"] = logical
|
||||||
|
if s.MetadataBytesPerBlock == nil {
|
||||||
|
zero := int64(0)
|
||||||
|
s.MetadataBytesPerBlock = &zero
|
||||||
|
s.Telemetry["metadata_bytes_per_block"] = zero
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if physical > 0 {
|
||||||
|
s.PhysicalBlockSizeBytes = &physical
|
||||||
|
s.Telemetry["physical_block_size_bytes"] = physical
|
||||||
|
}
|
||||||
|
if s.LogicalBlockSizeBytes != nil && s.MetadataBytesPerBlock != nil {
|
||||||
|
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyNVMeBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||||
|
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
out, err := exec.Command("nvme", "id-ns", devPath, "-H").CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(string(out))
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
func applySCSIProtectionBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||||
|
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
out, err := exec.Command("sg_readcap", "-l", devPath).CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(string(out))
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
func setStorageBlockGeometry(s *schema.HardwareStorage, dataBytes, metadataBytes int64) {
|
||||||
|
if s == nil || dataBytes <= 0 || metadataBytes < 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if s.Telemetry == nil {
|
||||||
|
s.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
s.LogicalBlockSizeBytes = &dataBytes
|
||||||
|
s.MetadataBytesPerBlock = &metadataBytes
|
||||||
|
s.Telemetry["logical_block_size_bytes"] = dataBytes
|
||||||
|
s.Telemetry["metadata_bytes_per_block"] = metadataBytes
|
||||||
|
s.Telemetry["block_format"] = formatBlockFormat(dataBytes, metadataBytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatBlockFormat(dataBytes, metadataBytes int64) string {
|
||||||
|
return strconv.FormatInt(dataBytes, 10) + "+" + strconv.FormatInt(metadataBytes, 10)
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseNVMeBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||||
|
if m := nvmeLBAFCompactRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||||
|
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||||
|
lbads, errLBADS := strconv.ParseInt(m[2], 10, 64)
|
||||||
|
if errMS == nil && errLBADS == nil && lbads >= 0 && lbads < 63 {
|
||||||
|
return 1 << lbads, ms, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m := nvmeLBAFVerboseRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||||
|
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||||
|
ds, errDS := strconv.ParseInt(m[2], 10, 64)
|
||||||
|
if errMS == nil && errDS == nil && ds > 0 {
|
||||||
|
return ds, ms, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseSCSIBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||||
|
m := sgReadcapBlockRE.FindStringSubmatch(raw)
|
||||||
|
if len(m) != 2 {
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
blockBytes, err := strconv.ParseInt(m[1], 10, 64)
|
||||||
|
if err != nil || blockBytes <= 0 {
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
if sgReadcapProtRE.MatchString(raw) {
|
||||||
|
return blockBytes, 8, true
|
||||||
|
}
|
||||||
|
return blockBytes, 0, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstInt64(root map[string]any, candidates ...string) (int64, bool) {
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if !strings.HasPrefix(candidate, "path:") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := strings.TrimPrefix(candidate, "path:")
|
||||||
|
if v, ok := nestedInt64(root, strings.Split(path, ".")); ok {
|
||||||
|
return v, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func nestedInt64(root map[string]any, path []string) (int64, bool) {
|
||||||
|
var current any = root
|
||||||
|
for _, key := range path {
|
||||||
|
obj, ok := current.(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
current, ok = obj[key]
|
||||||
|
if !ok {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
switch v := current.(type) {
|
||||||
|
case float64:
|
||||||
|
return int64(v), true
|
||||||
|
case float32:
|
||||||
|
return int64(v), true
|
||||||
|
case int:
|
||||||
|
return int64(v), true
|
||||||
|
case int64:
|
||||||
|
return v, true
|
||||||
|
case int32:
|
||||||
|
return int64(v), true
|
||||||
|
case json.Number:
|
||||||
|
n, err := v.Int64()
|
||||||
|
return n, err == nil
|
||||||
|
case string:
|
||||||
|
n, err := strconv.ParseInt(strings.TrimSpace(v), 10, 64)
|
||||||
|
return n, err == nil
|
||||||
|
default:
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
type storageHealthStatus struct {
|
type storageHealthStatus struct {
|
||||||
hasOverall bool
|
hasOverall bool
|
||||||
overallPassed bool
|
overallPassed bool
|
||||||
|
|||||||
69
audit/internal/collector/storage_block_format_test.go
Normal file
69
audit/internal/collector/storage_block_format_test.go
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestParseNVMeBlockFormatCompact(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := `
|
||||||
|
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||||
|
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||||
|
`
|
||||||
|
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||||
|
}
|
||||||
|
if dataBytes != 512 || metadataBytes != 0 {
|
||||||
|
t.Fatalf("got %d+%d want 512+0", dataBytes, metadataBytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVMeBlockFormatVerbose(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := `
|
||||||
|
LBA Format 0 : Metadata Size: 8 bytes - Data Size: 512 bytes - Relative Performance: 0 Better (in use)
|
||||||
|
LBA Format 1 : Metadata Size: 0 bytes - Data Size: 4096 bytes - Relative Performance: 1 Best
|
||||||
|
`
|
||||||
|
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||||
|
}
|
||||||
|
if dataBytes != 512 || metadataBytes != 8 {
|
||||||
|
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSCSIBlockFormatWithProtection(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := `
|
||||||
|
Read Capacity results:
|
||||||
|
Protection: prot_en=1, p_type=1, p_i_exponent=0
|
||||||
|
Logical block length=512 bytes
|
||||||
|
`
|
||||||
|
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||||
|
}
|
||||||
|
if dataBytes != 512 || metadataBytes != 8 {
|
||||||
|
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSCSIBlockFormatWithoutProtection(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := `
|
||||||
|
Read Capacity results:
|
||||||
|
Protection: prot_en=0, p_type=0, p_i_exponent=0
|
||||||
|
Logical block length=4096 bytes
|
||||||
|
`
|
||||||
|
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||||
|
}
|
||||||
|
if dataBytes != 4096 || metadataBytes != 0 {
|
||||||
|
t.Fatalf("got %d+%d want 4096+0", dataBytes, metadataBytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,6 +1,12 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
|
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
@@ -31,3 +37,82 @@ func TestParseStorageBytes(t *testing.T) {
|
|||||||
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
|
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tmp := t.TempDir()
|
||||||
|
rescanPath := filepath.Join(tmp, "pci-rescan")
|
||||||
|
scanDir := filepath.Join(tmp, "scsi_host")
|
||||||
|
host0Path := filepath.Join(scanDir, "host0", "scan")
|
||||||
|
host1Path := filepath.Join(scanDir, "host1", "scan")
|
||||||
|
argsPath := filepath.Join(tmp, "udevadm-args")
|
||||||
|
toolPath := filepath.Join(tmp, "udevadm")
|
||||||
|
if err := os.MkdirAll(filepath.Dir(host0Path), 0755); err != nil {
|
||||||
|
t.Fatalf("mkdir host0: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(host1Path), 0755); err != nil {
|
||||||
|
t.Fatalf("mkdir host1: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(host0Path, nil, 0644); err != nil {
|
||||||
|
t.Fatalf("touch host0 scan: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(host1Path, nil, 0644); err != nil {
|
||||||
|
t.Fatalf("touch host1 scan: %v", err)
|
||||||
|
}
|
||||||
|
script := "#!/bin/sh\nprintf '%s' \"$*\" > \"" + argsPath + "\"\n"
|
||||||
|
if err := os.WriteFile(toolPath, []byte(script), 0755); err != nil {
|
||||||
|
t.Fatalf("write udevadm stub: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
oldPath := os.Getenv("PATH")
|
||||||
|
if err := os.Setenv("PATH", tmp+string(os.PathListSeparator)+oldPath); err != nil {
|
||||||
|
t.Fatalf("set PATH: %v", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = os.Setenv("PATH", oldPath) }()
|
||||||
|
|
||||||
|
oldRescanPath := pciRescanPath
|
||||||
|
oldSCSIGlob := scsiHostScanGlob
|
||||||
|
oldWriteFile := hotplugWriteFile
|
||||||
|
oldExecCommand := hotplugExecCommand
|
||||||
|
oldGlob := hotplugGlob
|
||||||
|
pciRescanPath = rescanPath
|
||||||
|
scsiHostScanGlob = filepath.Join(scanDir, "host*", "scan")
|
||||||
|
hotplugWriteFile = os.WriteFile
|
||||||
|
hotplugExecCommand = exec.Command
|
||||||
|
hotplugGlob = filepath.Glob
|
||||||
|
defer func() {
|
||||||
|
pciRescanPath = oldRescanPath
|
||||||
|
scsiHostScanGlob = oldSCSIGlob
|
||||||
|
hotplugWriteFile = oldWriteFile
|
||||||
|
hotplugExecCommand = oldExecCommand
|
||||||
|
hotplugGlob = oldGlob
|
||||||
|
}()
|
||||||
|
|
||||||
|
bestEffortRescanHotplugStorage()
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(rescanPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read rescan file: %v", err)
|
||||||
|
}
|
||||||
|
if string(raw) != "1\n" {
|
||||||
|
t.Fatalf("rescan payload=%q want %q", string(raw), "1\n")
|
||||||
|
}
|
||||||
|
for _, path := range []string{host0Path, host1Path} {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read scsi scan file %s: %v", path, err)
|
||||||
|
}
|
||||||
|
if string(raw) != "- - -\n" {
|
||||||
|
t.Fatalf("scsi scan payload at %s =%q want %q", path, string(raw), "- - -\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
args, err := os.ReadFile(argsPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read udevadm args: %v", err)
|
||||||
|
}
|
||||||
|
if got := strings.TrimSpace(string(args)); got != "settle --timeout=10" {
|
||||||
|
t.Fatalf("udevadm args=%q want %q", got, "settle --timeout=10")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
101
audit/internal/collector/storage_scsi_test.go
Normal file
101
audit/internal/collector/storage_scsi_test.go
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestApplySCSISmartctlTelemetry(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := map[string]any{
|
||||||
|
"power_on_time": map[string]any{
|
||||||
|
"hours": float64(32123),
|
||||||
|
},
|
||||||
|
"accumulated_start_stop_cycles": float64(17),
|
||||||
|
"scsi_grown_defect_list": float64(4),
|
||||||
|
"percentage_used_endurance_indicator": float64(12),
|
||||||
|
"logical_block_size": float64(4096),
|
||||||
|
"logical_blocks_written": float64(1000),
|
||||||
|
"logical_blocks_read": float64(2000),
|
||||||
|
}
|
||||||
|
|
||||||
|
var disk schema.HardwareStorage
|
||||||
|
status := storageHealthStatus{}
|
||||||
|
applySCSISmartctlTelemetry(&disk, raw, &status)
|
||||||
|
|
||||||
|
if disk.PowerOnHours == nil || *disk.PowerOnHours != 32123 {
|
||||||
|
t.Fatalf("power_on_hours=%v want 32123", disk.PowerOnHours)
|
||||||
|
}
|
||||||
|
if disk.PowerCycles == nil || *disk.PowerCycles != 17 {
|
||||||
|
t.Fatalf("power_cycles=%v want 17", disk.PowerCycles)
|
||||||
|
}
|
||||||
|
if disk.ReallocatedSectors == nil || *disk.ReallocatedSectors != 4 {
|
||||||
|
t.Fatalf("reallocated=%v want 4", disk.ReallocatedSectors)
|
||||||
|
}
|
||||||
|
if disk.WrittenBytes == nil || *disk.WrittenBytes != 4096000 {
|
||||||
|
t.Fatalf("written_bytes=%v want 4096000", disk.WrittenBytes)
|
||||||
|
}
|
||||||
|
if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 {
|
||||||
|
t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes)
|
||||||
|
}
|
||||||
|
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 4096 {
|
||||||
|
t.Fatalf("logical_block_size_bytes=%v want 4096", disk.LogicalBlockSizeBytes)
|
||||||
|
}
|
||||||
|
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||||
|
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||||
|
}
|
||||||
|
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 {
|
||||||
|
t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct)
|
||||||
|
}
|
||||||
|
if disk.LifeRemainingPct == nil || *disk.LifeRemainingPct != 88 {
|
||||||
|
t.Fatalf("life_remaining_pct=%v want 88", disk.LifeRemainingPct)
|
||||||
|
}
|
||||||
|
if status.reallocatedSectors != 4 {
|
||||||
|
t.Fatalf("status.reallocated=%d want 4", status.reallocatedSectors)
|
||||||
|
}
|
||||||
|
if status.lifeRemainingPct != 88 {
|
||||||
|
t.Fatalf("status.life_remaining_pct=%d want 88", status.lifeRemainingPct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
powerOnHours := int64(10)
|
||||||
|
writtenBytes := int64(20)
|
||||||
|
lifeRemaining := 30.0
|
||||||
|
disk := schema.HardwareStorage{
|
||||||
|
PowerOnHours: &powerOnHours,
|
||||||
|
WrittenBytes: &writtenBytes,
|
||||||
|
LifeRemainingPct: &lifeRemaining,
|
||||||
|
}
|
||||||
|
raw := map[string]any{
|
||||||
|
"power_on_time": map[string]any{"hours": float64(999)},
|
||||||
|
"logical_block_size": float64(512),
|
||||||
|
"logical_blocks_written": float64(999),
|
||||||
|
"percentage_used_endurance_indicator": float64(50),
|
||||||
|
}
|
||||||
|
|
||||||
|
applySCSISmartctlTelemetry(&disk, raw, nil)
|
||||||
|
|
||||||
|
if *disk.PowerOnHours != 10 {
|
||||||
|
t.Fatalf("power_on_hours overwritten: got %d want 10", *disk.PowerOnHours)
|
||||||
|
}
|
||||||
|
if *disk.WrittenBytes != 20 {
|
||||||
|
t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
|
||||||
|
}
|
||||||
|
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 512 {
|
||||||
|
t.Fatalf("logical_block_size_bytes=%v want 512", disk.LogicalBlockSizeBytes)
|
||||||
|
}
|
||||||
|
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||||
|
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||||
|
}
|
||||||
|
if *disk.LifeRemainingPct != 30 {
|
||||||
|
t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
|
||||||
|
}
|
||||||
|
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 50 {
|
||||||
|
t.Fatalf("life_used_pct=%v want 50", disk.LifeUsedPct)
|
||||||
|
}
|
||||||
|
}
|
||||||
25
audit/internal/collector/storage_telemetry_test.go
Normal file
25
audit/internal/collector/storage_telemetry_test.go
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestSmartLBAsToBytes(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
lbas int64
|
||||||
|
want int64
|
||||||
|
}{
|
||||||
|
{name: "zero", lbas: 0, want: 0},
|
||||||
|
{name: "single lba", lbas: 1, want: 512},
|
||||||
|
{name: "multiple lbas", lbas: 2048, want: 1048576},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := smartLBAsToBytes(tt.lbas); got != tt.want {
|
||||||
|
t.Fatalf("smartLBAsToBytes(%d)=%d want %d", tt.lbas, got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseMDAdmPlatformLicense(t *testing.T) {
|
||||||
|
premium := `Platform : Intel(R) Virtual RAID on CPU
|
||||||
|
Version : 1.3.0.1138
|
||||||
|
RAID Levels : raid0 raid1 raid5 raid10
|
||||||
|
Total Disks : 4
|
||||||
|
License : Premium
|
||||||
|
`
|
||||||
|
got := parseMDAdmPlatformLicense(premium)
|
||||||
|
if got == nil || *got != "premium" {
|
||||||
|
t.Fatalf("expected 'premium', got %v", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
standard := `Platform : Intel(R) Virtual RAID on CPU
|
||||||
|
License : Standard
|
||||||
|
`
|
||||||
|
got = parseMDAdmPlatformLicense(standard)
|
||||||
|
if got == nil || *got != "standard" {
|
||||||
|
t.Fatalf("expected 'standard', got %v", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
noLicense := `Platform : Intel(R) Virtual RAID on CPU
|
||||||
|
Version : 1.0.0
|
||||||
|
`
|
||||||
|
got = parseMDAdmPlatformLicense(noLicense)
|
||||||
|
if got != nil {
|
||||||
|
t.Fatalf("expected nil, got %v", *got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHasVROCController(t *testing.T) {
|
func TestHasVROCController(t *testing.T) {
|
||||||
intel := vendorIntel
|
intel := vendorIntel
|
||||||
model := "Volume Management Device NVMe RAID Controller"
|
model := "Volume Management Device NVMe RAID Controller"
|
||||||
|
|||||||
@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
|
|||||||
VBIOS string
|
VBIOS string
|
||||||
PowerLimitW float64
|
PowerLimitW float64
|
||||||
DefaultPowerLimitW float64
|
DefaultPowerLimitW float64
|
||||||
|
MinPowerLimitW float64
|
||||||
|
MaxPowerLimitW float64
|
||||||
MaxGraphicsClockMHz float64
|
MaxGraphicsClockMHz float64
|
||||||
MaxMemoryClockMHz float64
|
MaxMemoryClockMHz float64
|
||||||
BaseGraphicsClockMHz float64
|
BaseGraphicsClockMHz float64
|
||||||
@@ -65,6 +67,13 @@ type benchmarkPowerCalibrationResult struct {
|
|||||||
MetricRows []GPUMetricRow
|
MetricRows []GPUMetricRow
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type benchmarkPowerCalibrationRunSummary struct {
|
||||||
|
LoadedSDR benchmarkSDRSeriesSummary
|
||||||
|
AvgFanRPM float64
|
||||||
|
AvgFanDutyCyclePct float64
|
||||||
|
FanSamples int
|
||||||
|
}
|
||||||
|
|
||||||
type benchmarkBurnProfile struct {
|
type benchmarkBurnProfile struct {
|
||||||
name string
|
name string
|
||||||
category string
|
category string
|
||||||
@@ -95,6 +104,9 @@ var (
|
|||||||
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
|
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
|
||||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||||
|
benchmarkGeteuid = os.Geteuid
|
||||||
|
benchmarkResetNvidiaGPU = resetNvidiaGPU
|
||||||
|
benchmarkSleep = time.Sleep
|
||||||
)
|
)
|
||||||
|
|
||||||
// benchmarkPrecisionPhases lists the precision categories run as individual
|
// benchmarkPrecisionPhases lists the precision categories run as individual
|
||||||
@@ -220,8 +232,6 @@ func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters)
|
|||||||
return "hw_thermal"
|
return "hw_thermal"
|
||||||
case diff.SWThermalSlowdownUS > 0:
|
case diff.SWThermalSlowdownUS > 0:
|
||||||
return "sw_thermal"
|
return "sw_thermal"
|
||||||
case diff.HWPowerBrakeSlowdownUS > 0:
|
|
||||||
return "hw_power_brake"
|
|
||||||
default:
|
default:
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
@@ -240,6 +250,67 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
|
||||||
|
}
|
||||||
|
out, err := benchmarkResetNvidiaGPU(gpuIndex)
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||||
|
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
|
||||||
|
)
|
||||||
|
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
|
||||||
|
for _, line := range strings.Split(trimmed, "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line != "" {
|
||||||
|
logFunc(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
rc := 0
|
||||||
|
if err != nil {
|
||||||
|
rc = 1
|
||||||
|
}
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||||
|
fmt.Sprintf("rc: %d", rc),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||||
|
if len(gpuIndices) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if benchmarkGeteuid() != 0 {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
|
||||||
|
}
|
||||||
|
return append([]int(nil), gpuIndices...)
|
||||||
|
}
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var failed []int
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
|
||||||
|
failed = append(failed, idx)
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx))
|
||||||
|
}
|
||||||
|
benchmarkSleep(time.Second)
|
||||||
|
}
|
||||||
|
return failed
|
||||||
|
}
|
||||||
|
|
||||||
func benchmarkPowerEngine() string {
|
func benchmarkPowerEngine() string {
|
||||||
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
||||||
case BenchmarkPowerEngineTargetedPower:
|
case BenchmarkPowerEngineTargetedPower:
|
||||||
@@ -351,9 +422,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
||||||
result.Normalization.Status = "partial"
|
result.Normalization.Status = "partial"
|
||||||
}
|
}
|
||||||
// Enrich with max clocks from verbose output — covers GPUs where
|
// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
|
||||||
// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
|
// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)
|
||||||
|
|
||||||
activeApps, err := queryActiveComputeApps(selected)
|
activeApps, err := queryActiveComputeApps(selected)
|
||||||
if err == nil && len(activeApps) > 0 {
|
if err == nil && len(activeApps) > 0 {
|
||||||
@@ -737,8 +808,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|||||||
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
||||||
// all driver versions, so we fall back to the base set if the full query fails.
|
// all driver versions, so we fall back to the base set if the full query fails.
|
||||||
// The minimal fallback omits clock fields entirely — clocks.max.* returns
|
// The minimal fallback omits clock fields entirely — clocks.max.* returns
|
||||||
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
|
// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
|
||||||
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
|
// then recovered from nvidia-smi -q.
|
||||||
var benchmarkGPUInfoQueries = []struct {
|
var benchmarkGPUInfoQueries = []struct {
|
||||||
fields string
|
fields string
|
||||||
extended bool // whether this query includes optional extended fields
|
extended bool // whether this query includes optional extended fields
|
||||||
@@ -758,12 +829,9 @@ var benchmarkGPUInfoQueries = []struct {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
|
// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
|
||||||
// any GPU in infoByIndex where those values are still zero. It parses the
|
// for fields that may be missing from --query-gpu on some driver versions.
|
||||||
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
|
func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
|
||||||
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
|
|
||||||
// return exit status 2 but the verbose query works fine.
|
|
||||||
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
|
|
||||||
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
|
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -784,6 +852,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
|||||||
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
||||||
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
|
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
|
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
|
minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
|
maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
|
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
|
||||||
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
|
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
|
||||||
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
|
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
|
||||||
@@ -843,6 +913,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if info.MinPowerLimitW == 0 {
|
||||||
|
if m := minPwrRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||||
|
info.MinPowerLimitW = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if info.MaxPowerLimitW == 0 {
|
||||||
|
if m := maxPwrRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||||
|
info.MaxPowerLimitW = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if info.MultiprocessorCount == 0 {
|
if info.MultiprocessorCount == 0 {
|
||||||
if m := smCountRe.FindSubmatch(section); m != nil {
|
if m := smCountRe.FindSubmatch(section); m != nil {
|
||||||
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
|
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
|
||||||
@@ -2365,6 +2449,16 @@ type sdrPowerSnapshot struct {
|
|||||||
SkippedSensors []string // sensors rejected during self-healing
|
SkippedSensors []string // sensors rejected during self-healing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type benchmarkSDRSeriesSummary struct {
|
||||||
|
PSUInW float64
|
||||||
|
PSUOutW float64
|
||||||
|
GPUSlotW float64
|
||||||
|
PSUSlots map[string]BenchmarkPSUSlotPower
|
||||||
|
Samples int
|
||||||
|
|
||||||
|
SkippedSensors []string
|
||||||
|
}
|
||||||
|
|
||||||
// sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
|
// sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
|
||||||
type sdrSensor struct {
|
type sdrSensor struct {
|
||||||
name string
|
name string
|
||||||
@@ -2494,6 +2588,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
|
|||||||
return snap
|
return snap
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot {
|
||||||
|
if intervalSec <= 0 {
|
||||||
|
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||||
|
}
|
||||||
|
ch := make(chan []sdrPowerSnapshot, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ch)
|
||||||
|
var samples []sdrPowerSnapshot
|
||||||
|
record := func() {
|
||||||
|
snap := sampleIPMISDRPowerSensors()
|
||||||
|
if snap.PSUInW <= 0 && snap.PSUOutW <= 0 && snap.GPUSlotW <= 0 && len(snap.PSUSlots) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
samples = append(samples, snap)
|
||||||
|
}
|
||||||
|
record()
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- samples
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
record()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
|
func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary {
|
||||||
|
var summary benchmarkSDRSeriesSummary
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return summary
|
||||||
|
}
|
||||||
|
|
||||||
|
type slotAggregate struct {
|
||||||
|
inputs []float64
|
||||||
|
outputs []float64
|
||||||
|
status string
|
||||||
|
}
|
||||||
|
|
||||||
|
slotAgg := make(map[string]*slotAggregate)
|
||||||
|
skippedSet := make(map[string]struct{})
|
||||||
|
var inputTotals []float64
|
||||||
|
var outputTotals []float64
|
||||||
|
var gpuSlotTotals []float64
|
||||||
|
|
||||||
|
for _, sample := range samples {
|
||||||
|
if sample.PSUInW > 0 {
|
||||||
|
inputTotals = append(inputTotals, sample.PSUInW)
|
||||||
|
}
|
||||||
|
if sample.PSUOutW > 0 {
|
||||||
|
outputTotals = append(outputTotals, sample.PSUOutW)
|
||||||
|
}
|
||||||
|
if sample.GPUSlotW > 0 {
|
||||||
|
gpuSlotTotals = append(gpuSlotTotals, sample.GPUSlotW)
|
||||||
|
}
|
||||||
|
for _, skipped := range sample.SkippedSensors {
|
||||||
|
if skipped != "" {
|
||||||
|
skippedSet[skipped] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for slot, reading := range sample.PSUSlots {
|
||||||
|
agg := slotAgg[slot]
|
||||||
|
if agg == nil {
|
||||||
|
agg = &slotAggregate{}
|
||||||
|
slotAgg[slot] = agg
|
||||||
|
}
|
||||||
|
if reading.InputW != nil && *reading.InputW > 0 {
|
||||||
|
agg.inputs = append(agg.inputs, *reading.InputW)
|
||||||
|
}
|
||||||
|
if reading.OutputW != nil && *reading.OutputW > 0 {
|
||||||
|
agg.outputs = append(agg.outputs, *reading.OutputW)
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case reading.Status == "":
|
||||||
|
case agg.status == "":
|
||||||
|
agg.status = reading.Status
|
||||||
|
case agg.status == "OK" && reading.Status != "OK":
|
||||||
|
agg.status = reading.Status
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
summary.PSUInW = benchmarkMean(inputTotals)
|
||||||
|
summary.PSUOutW = benchmarkMean(outputTotals)
|
||||||
|
summary.GPUSlotW = benchmarkMean(gpuSlotTotals)
|
||||||
|
summary.Samples = len(samples)
|
||||||
|
|
||||||
|
if len(slotAgg) > 0 {
|
||||||
|
summary.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(slotAgg))
|
||||||
|
for slot, agg := range slotAgg {
|
||||||
|
reading := BenchmarkPSUSlotPower{Status: agg.status}
|
||||||
|
if mean := benchmarkMean(agg.inputs); mean > 0 {
|
||||||
|
v := mean
|
||||||
|
reading.InputW = &v
|
||||||
|
}
|
||||||
|
if mean := benchmarkMean(agg.outputs); mean > 0 {
|
||||||
|
v := mean
|
||||||
|
reading.OutputW = &v
|
||||||
|
}
|
||||||
|
summary.PSUSlots[slot] = reading
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(skippedSet) > 0 {
|
||||||
|
summary.SkippedSensors = make([]string, 0, len(skippedSet))
|
||||||
|
for skipped := range skippedSet {
|
||||||
|
summary.SkippedSensors = append(summary.SkippedSensors, skipped)
|
||||||
|
}
|
||||||
|
sort.Strings(summary.SkippedSensors)
|
||||||
|
}
|
||||||
|
|
||||||
|
return summary
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return benchmarkSDRSeriesSummary{}
|
||||||
|
}
|
||||||
|
stopCh := make(chan struct{})
|
||||||
|
doneCh := startIPMISDRSampler(stopCh, intervalSec)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||||
|
}
|
||||||
|
close(stopCh)
|
||||||
|
return summarizeSDRPowerSeries(<-doneCh)
|
||||||
|
}
|
||||||
|
|
||||||
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||||
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||||
func queryIPMIServerPowerW() (float64, error) {
|
func queryIPMIServerPowerW() (float64, error) {
|
||||||
@@ -3038,12 +3263,12 @@ func runBenchmarkPowerCalibration(
|
|||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
seedLimits map[int]int,
|
seedLimits map[int]int,
|
||||||
durationSec int,
|
durationSec int,
|
||||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
|
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow, benchmarkPowerCalibrationRunSummary) {
|
||||||
calibDurationSec := durationSec
|
calibDurationSec := durationSec
|
||||||
|
var runSummary benchmarkPowerCalibrationRunSummary
|
||||||
if calibDurationSec <= 0 {
|
if calibDurationSec <= 0 {
|
||||||
calibDurationSec = 120
|
calibDurationSec = 120
|
||||||
}
|
}
|
||||||
const maxDerateW = 150
|
|
||||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||||
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||||||
const calibSearchTolerance = 10
|
const calibSearchTolerance = 10
|
||||||
@@ -3058,12 +3283,12 @@ func runBenchmarkPowerCalibration(
|
|||||||
if engine == BenchmarkPowerEngineTargetedPower {
|
if engine == BenchmarkPowerEngineTargetedPower {
|
||||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
|
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
|
||||||
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
|
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
|
||||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if killed := KillTestWorkers(); len(killed) > 0 {
|
if killed := KillTestWorkers(); len(killed) > 0 {
|
||||||
@@ -3090,8 +3315,9 @@ func runBenchmarkPowerCalibration(
|
|||||||
originalLimitW int
|
originalLimitW int
|
||||||
appliedLimitW int
|
appliedLimitW int
|
||||||
minLimitW int
|
minLimitW int
|
||||||
lo int // highest verified-stable limit (assumed: minLimitW)
|
lo int // highest verified-stable limit
|
||||||
hi int // lowest verified-unstable limit (exclusive sentinel above start)
|
hi int // lowest verified-unstable limit (exclusive sentinel above start)
|
||||||
|
loVerified bool
|
||||||
calib benchmarkPowerCalibrationResult
|
calib benchmarkPowerCalibrationResult
|
||||||
converged bool
|
converged bool
|
||||||
}
|
}
|
||||||
@@ -3113,23 +3339,17 @@ func runBenchmarkPowerCalibration(
|
|||||||
if defaultLimitW <= 0 {
|
if defaultLimitW <= 0 {
|
||||||
defaultLimitW = originalLimitW
|
defaultLimitW = originalLimitW
|
||||||
}
|
}
|
||||||
appliedLimitW := originalLimitW
|
appliedLimitW := initialBenchmarkCalibrationLimitW(info)
|
||||||
if appliedLimitW <= 0 {
|
if appliedLimitW <= 0 {
|
||||||
appliedLimitW = defaultLimitW
|
appliedLimitW = defaultLimitW
|
||||||
}
|
}
|
||||||
minLimitW := appliedLimitW
|
minLimitW := int(math.Round(info.MinPowerLimitW))
|
||||||
switch {
|
if minLimitW <= 0 {
|
||||||
case defaultLimitW > 0:
|
minLimitW = appliedLimitW
|
||||||
minLimitW = defaultLimitW - maxDerateW
|
|
||||||
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
|
|
||||||
if minLimitW < floorByRatio {
|
|
||||||
minLimitW = floorByRatio
|
|
||||||
}
|
|
||||||
case appliedLimitW > 0:
|
|
||||||
minLimitW = appliedLimitW - maxDerateW
|
|
||||||
}
|
}
|
||||||
if minLimitW < calibSearchTolerance {
|
maxLimitW := int(math.Round(info.MaxPowerLimitW))
|
||||||
minLimitW = calibSearchTolerance
|
if maxLimitW > 0 && appliedLimitW > maxLimitW {
|
||||||
|
appliedLimitW = maxLimitW
|
||||||
}
|
}
|
||||||
s := &gpuCalibState{
|
s := &gpuCalibState{
|
||||||
idx: idx,
|
idx: idx,
|
||||||
@@ -3141,11 +3361,24 @@ func runBenchmarkPowerCalibration(
|
|||||||
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
||||||
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
||||||
}
|
}
|
||||||
|
if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
|
||||||
|
s.appliedLimitW = minLimitW
|
||||||
|
s.hi = minLimitW + 1
|
||||||
|
}
|
||||||
|
if info.MinPowerLimitW <= 0 {
|
||||||
|
s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
|
||||||
|
}
|
||||||
if seedLimits != nil {
|
if seedLimits != nil {
|
||||||
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
|
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
|
||||||
// A previously validated limit is only a starting point. Re-run
|
// A previously validated limit is only a starting point. Re-run
|
||||||
// targeted_power under the current multi-GPU thermal load and derate
|
// targeted_power under the current multi-GPU thermal load and derate
|
||||||
// again if this step shows new throttling.
|
// again if this step shows new throttling.
|
||||||
|
if seedW < s.minLimitW {
|
||||||
|
seedW = s.minLimitW
|
||||||
|
}
|
||||||
|
if maxLimitW > 0 && seedW > maxLimitW {
|
||||||
|
seedW = maxLimitW
|
||||||
|
}
|
||||||
if canDerate {
|
if canDerate {
|
||||||
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
|
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
|
||||||
}
|
}
|
||||||
@@ -3220,6 +3453,10 @@ calibDone:
|
|||||||
}
|
}
|
||||||
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
||||||
doneCh := make(chan sharedAttemptResult, 1)
|
doneCh := make(chan sharedAttemptResult, 1)
|
||||||
|
sdrStopCh := make(chan struct{})
|
||||||
|
sdrDoneCh := startIPMISDRSampler(sdrStopCh, benchmarkPowerAutotuneSampleInterval)
|
||||||
|
fanStopCh := make(chan struct{})
|
||||||
|
fanDoneCh := startBenchmarkFanSampler(fanStopCh, benchmarkPowerAutotuneSampleInterval)
|
||||||
go func() {
|
go func() {
|
||||||
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
|
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
|
||||||
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
||||||
@@ -3259,6 +3496,10 @@ calibDone:
|
|||||||
}
|
}
|
||||||
ticker.Stop()
|
ticker.Stop()
|
||||||
cancelAttempt()
|
cancelAttempt()
|
||||||
|
close(sdrStopCh)
|
||||||
|
close(fanStopCh)
|
||||||
|
attemptSDRSummary := summarizeSDRPowerSeries(<-sdrDoneCh)
|
||||||
|
attemptFanSummary := <-fanDoneCh
|
||||||
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
||||||
// Accumulate telemetry rows with attempt stage label.
|
// Accumulate telemetry rows with attempt stage label.
|
||||||
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
|
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
|
||||||
@@ -3296,10 +3537,14 @@ calibDone:
|
|||||||
busyDelaySec = 1
|
busyDelaySec = 1
|
||||||
|
|
||||||
// Per-GPU analysis and binary search update.
|
// Per-GPU analysis and binary search update.
|
||||||
|
attemptStable := ar.err == nil
|
||||||
for _, s := range active {
|
for _, s := range active {
|
||||||
perGPU := filterRowsByGPU(ar.rows, s.idx)
|
perGPU := filterRowsByGPU(ar.rows, s.idx)
|
||||||
summary := summarizeBenchmarkTelemetry(perGPU)
|
summary := summarizeBenchmarkTelemetry(perGPU)
|
||||||
throttle := throttleReasons[s.idx]
|
throttle := throttleReasons[s.idx]
|
||||||
|
if throttle != "" || summary.P95PowerW <= 0 {
|
||||||
|
attemptStable = false
|
||||||
|
}
|
||||||
|
|
||||||
// Cooling warning: thermal throttle with fans not at maximum.
|
// Cooling warning: thermal throttle with fans not at maximum.
|
||||||
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
|
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
|
||||||
@@ -3333,6 +3578,7 @@ calibDone:
|
|||||||
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
||||||
s.lo = s.appliedLimitW
|
s.lo = s.appliedLimitW
|
||||||
|
s.loVerified = true
|
||||||
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
||||||
next := roundTo5W((s.lo + s.hi) / 2)
|
next := roundTo5W((s.lo + s.hi) / 2)
|
||||||
if next > s.lo && next < s.hi {
|
if next > s.lo && next < s.hi {
|
||||||
@@ -3371,7 +3617,23 @@ calibDone:
|
|||||||
s.hi = s.appliedLimitW
|
s.hi = s.appliedLimitW
|
||||||
|
|
||||||
if s.hi-s.lo <= calibSearchTolerance {
|
if s.hi-s.lo <= calibSearchTolerance {
|
||||||
if s.lo > s.minLimitW {
|
if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
|
||||||
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
|
||||||
|
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
|
||||||
|
s.converged = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.appliedLimitW = s.minLimitW
|
||||||
|
s.calib.AppliedPowerLimitW = float64(s.minLimitW)
|
||||||
|
s.calib.Derated = s.minLimitW < s.originalLimitW
|
||||||
|
s.info.PowerLimitW = float64(s.minLimitW)
|
||||||
|
infoByIndex[s.idx] = s.info
|
||||||
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if s.loVerified {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
||||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
||||||
s.appliedLimitW = s.lo
|
s.appliedLimitW = s.lo
|
||||||
@@ -3383,7 +3645,8 @@ calibDone:
|
|||||||
s.calib.Completed = true
|
s.calib.Completed = true
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
|
||||||
}
|
}
|
||||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
@@ -3398,9 +3661,7 @@ calibDone:
|
|||||||
next = (s.lo + s.hi) / 2
|
next = (s.lo + s.hi) / 2
|
||||||
}
|
}
|
||||||
if next < s.minLimitW {
|
if next < s.minLimitW {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
next = s.minLimitW
|
||||||
s.converged = true
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
||||||
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||||||
@@ -3416,6 +3677,16 @@ calibDone:
|
|||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
|
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
|
||||||
}
|
}
|
||||||
|
if attemptStable {
|
||||||
|
if attemptSDRSummary.Samples > 0 {
|
||||||
|
runSummary.LoadedSDR = attemptSDRSummary
|
||||||
|
}
|
||||||
|
if attemptFanSummary.FanSamples > 0 {
|
||||||
|
runSummary.AvgFanRPM = attemptFanSummary.AvgFanRPM
|
||||||
|
runSummary.AvgFanDutyCyclePct = attemptFanSummary.AvgFanDutyCyclePct
|
||||||
|
runSummary.FanSamples = attemptFanSummary.FanSamples
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, s := range states {
|
for _, s := range states {
|
||||||
@@ -3424,7 +3695,7 @@ calibDone:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
writeBenchmarkMetricsFiles(runDir, allCalibRows)
|
writeBenchmarkMetricsFiles(runDir, allCalibRows)
|
||||||
return results, restore, allCalibRows
|
return results, restore, allCalibRows, runSummary
|
||||||
}
|
}
|
||||||
|
|
||||||
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
||||||
@@ -3439,6 +3710,24 @@ func roundTo5W(w int) int {
|
|||||||
return ((w + 2) / 5) * 5
|
return ((w + 2) / 5) * 5
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func initialBenchmarkCalibrationLimitW(info benchmarkGPUInfo) int {
|
||||||
|
defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
|
||||||
|
currentLimitW := int(math.Round(info.PowerLimitW))
|
||||||
|
maxLimitW := int(math.Round(info.MaxPowerLimitW))
|
||||||
|
|
||||||
|
startW := defaultLimitW
|
||||||
|
if startW <= 0 {
|
||||||
|
startW = currentLimitW
|
||||||
|
}
|
||||||
|
if startW <= 0 {
|
||||||
|
startW = maxLimitW
|
||||||
|
}
|
||||||
|
if maxLimitW > 0 && startW > maxLimitW {
|
||||||
|
startW = maxLimitW
|
||||||
|
}
|
||||||
|
return startW
|
||||||
|
}
|
||||||
|
|
||||||
// meanFanRPM returns the average RPM across a set of fan readings.
|
// meanFanRPM returns the average RPM across a set of fan readings.
|
||||||
func meanFanRPM(fans []FanReading) float64 {
|
func meanFanRPM(fans []FanReading) float64 {
|
||||||
if len(fans) == 0 {
|
if len(fans) == 0 {
|
||||||
@@ -3451,6 +3740,47 @@ func meanFanRPM(fans []FanReading) float64 {
|
|||||||
return sum / float64(len(fans))
|
return sum / float64(len(fans))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary {
|
||||||
|
if intervalSec <= 0 {
|
||||||
|
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||||
|
}
|
||||||
|
ch := make(chan benchmarkPowerCalibrationRunSummary, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ch)
|
||||||
|
var rpmSamples []float64
|
||||||
|
var dutySamples []float64
|
||||||
|
record := func() {
|
||||||
|
fans, err := sampleFanSpeeds()
|
||||||
|
if err != nil || len(fans) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if rpm := meanFanRPM(fans); rpm > 0 {
|
||||||
|
rpmSamples = append(rpmSamples, rpm)
|
||||||
|
}
|
||||||
|
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok && duty > 0 {
|
||||||
|
dutySamples = append(dutySamples, duty)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
record()
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- benchmarkPowerCalibrationRunSummary{
|
||||||
|
AvgFanRPM: benchmarkMean(rpmSamples),
|
||||||
|
AvgFanDutyCyclePct: benchmarkMean(dutySamples),
|
||||||
|
FanSamples: len(rpmSamples),
|
||||||
|
}
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
record()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
func powerBenchDurationSec(profile string) int {
|
func powerBenchDurationSec(profile string) int {
|
||||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||||
case NvidiaBenchmarkProfileStability:
|
case NvidiaBenchmarkProfileStability:
|
||||||
@@ -3479,41 +3809,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
|
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
|
||||||
if sp := result.ServerPower; sp != nil && sp.Available {
|
if sp := result.ServerPower; sp != nil && sp.Available {
|
||||||
fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW)
|
sourceLabel := "autotuned source"
|
||||||
if sp.PSUInputLoadedW > 0 {
|
switch normalizeBenchmarkPowerSource(sp.Source) {
|
||||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta)
|
sourceLabel = "autotuned source (SDR PSU AC input)"
|
||||||
|
case BenchmarkPowerSourceDCMI:
|
||||||
|
sourceLabel = "autotuned source (DCMI)"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio)
|
fmt.Fprintf(&b, "**Server power delta (%s):** %.0f W \n", sourceLabel, sp.DeltaW)
|
||||||
|
fmt.Fprintf(&b, "**Reporting ratio:** %.2f \n", sp.ReportingRatio)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
// Server power comparison table.
|
// Server power comparison table.
|
||||||
if sp := result.ServerPower; sp != nil {
|
if sp := result.ServerPower; sp != nil {
|
||||||
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
||||||
|
selectedSource := normalizeBenchmarkPowerSource(sp.Source)
|
||||||
|
selectedSourceLabel := "Selected source"
|
||||||
|
if selectedSource == BenchmarkPowerSourceSDRPSUInput {
|
||||||
|
selectedSourceLabel = "Selected source (SDR PSU AC input)"
|
||||||
|
} else if selectedSource == BenchmarkPowerSourceDCMI {
|
||||||
|
selectedSourceLabel = "Selected source (DCMI)"
|
||||||
|
}
|
||||||
var spRows [][]string
|
var spRows [][]string
|
||||||
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
|
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
|
||||||
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
|
|
||||||
if sp.GPUSlotTotalW > 0 {
|
|
||||||
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
|
|
||||||
}
|
|
||||||
if sp.Available {
|
if sp.Available {
|
||||||
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
|
spRows = append(spRows, []string{selectedSourceLabel + " idle power", fmt.Sprintf("%.0f W", sp.IdleW)})
|
||||||
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
|
spRows = append(spRows, []string{selectedSourceLabel + " loaded power", fmt.Sprintf("%.0f W", sp.LoadedW)})
|
||||||
spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
|
spRows = append(spRows, []string{selectedSourceLabel + " Δ power (loaded − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)})
|
||||||
}
|
}
|
||||||
if sp.PSUInputLoadedW > 0 {
|
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 {
|
||||||
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
|
spRows = append(spRows, []string{"PSU AC input (idle avg, pre-load phase)", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
|
||||||
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
|
spRows = append(spRows, []string{"PSU AC input (loaded avg, final phase)", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
|
||||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||||
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
|
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", fmt.Sprintf("%.0f W", psuDelta)})
|
||||||
}
|
|
||||||
if sp.PSUOutputLoadedW > 0 {
|
|
||||||
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
|
|
||||||
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
|
|
||||||
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
|
|
||||||
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
|
|
||||||
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if sp.Available {
|
if sp.Available {
|
||||||
ratio := sp.ReportingRatio
|
ratio := sp.ReportingRatio
|
||||||
@@ -3530,8 +3858,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
default:
|
default:
|
||||||
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
||||||
}
|
}
|
||||||
spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
|
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
|
||||||
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
||||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||||
sdrRatio := psuDelta / sp.GPUReportedSumW
|
sdrRatio := psuDelta / sp.GPUReportedSumW
|
||||||
sdrNote := ""
|
sdrNote := ""
|
||||||
@@ -3543,12 +3871,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
default:
|
default:
|
||||||
sdrNote = "✗ significant discrepancy"
|
sdrNote = "✗ significant discrepancy"
|
||||||
}
|
}
|
||||||
spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
|
spRows = append(spRows, []string{"PSU AC input reporting ratio", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
|
spRows = append(spRows, []string{"IPMI availability", "not available — IPMI not supported or ipmitool not found"})
|
||||||
}
|
}
|
||||||
b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
|
b.WriteString(fmtMDTable([]string{"Metric", "Value"}, spRows))
|
||||||
for _, note := range sp.Notes {
|
for _, note := range sp.Notes {
|
||||||
fmt.Fprintf(&b, "\n> %s\n", note)
|
fmt.Fprintf(&b, "\n> %s\n", note)
|
||||||
}
|
}
|
||||||
@@ -3600,11 +3928,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
psuDistRows = append(psuDistRows, []string{
|
psuDistRows = append(psuDistRows, []string{
|
||||||
slot,
|
slot,
|
||||||
fmtW(idle.InputW), fmtW(loaded.InputW),
|
fmtW(idle.InputW), fmtW(loaded.InputW),
|
||||||
fmtW(idle.OutputW), fmtW(loaded.OutputW),
|
|
||||||
deltaStr, status,
|
deltaStr, status,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
|
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle avg)", "AC Input (loaded avg)", "Load Δ", "Status"}, psuDistRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -3652,7 +3979,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
fan,
|
fan,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
|
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Avg Fan RPM (duty%)"}, sgRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
@@ -3761,7 +4088,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
for _, slot := range psuSlots {
|
for _, slot := range psuSlots {
|
||||||
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
|
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
|
||||||
}
|
}
|
||||||
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
|
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Avg Fan RPM (duty%)")
|
||||||
|
|
||||||
var psuRows [][]string
|
var psuRows [][]string
|
||||||
for _, step := range result.RampSteps {
|
for _, step := range result.RampSteps {
|
||||||
@@ -3842,7 +4169,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
pdRows = append(pdRows, []string{
|
pdRows = append(pdRows, []string{
|
||||||
fmt.Sprintf("GPU %d", gpu.Index),
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
|
|
||||||
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
|
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
|
||||||
fmt.Sprintf("%.0f W", stable),
|
fmt.Sprintf("%.0f W", stable),
|
||||||
realization,
|
realization,
|
||||||
@@ -3855,13 +4181,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
pdRows = append(pdRows, []string{
|
pdRows = append(pdRows, []string{
|
||||||
"**Platform**",
|
"**Platform**",
|
||||||
fmt.Sprintf("**%.0f W**", totalDefault),
|
|
||||||
"—",
|
"—",
|
||||||
fmt.Sprintf("**%.0f W**", totalStable),
|
fmt.Sprintf("**%.0f W**", totalStable),
|
||||||
fmt.Sprintf("**%s**", platformReal),
|
fmt.Sprintf("**%s**", platformReal),
|
||||||
"",
|
"",
|
||||||
})
|
})
|
||||||
b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
|
b.WriteString(fmtMDTable([]string{"GPU", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
// Balance across GPUs — only meaningful with 2+ GPUs.
|
// Balance across GPUs — only meaningful with 2+ GPUs.
|
||||||
@@ -4011,7 +4336,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
{"Avg Temp °C", singleTemp},
|
{"Avg Temp °C", singleTemp},
|
||||||
{"Power W", singlePwr},
|
{"Power W", singlePwr},
|
||||||
{"Per GPU wall W", singleWall},
|
{"Per GPU wall W", singleWall},
|
||||||
{"Fan RPM (duty%)", singleFan},
|
{"Avg Fan RPM (duty%)", singleFan},
|
||||||
}
|
}
|
||||||
if lastStep != nil {
|
if lastStep != nil {
|
||||||
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
|
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
|
||||||
@@ -4096,14 +4421,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||||
}
|
}
|
||||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
|
||||||
if infoErr != nil {
|
|
||||||
return "", infoErr
|
|
||||||
}
|
|
||||||
// Capture full nvidia-smi -q snapshot at the start of the run.
|
|
||||||
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
|
||||||
}
|
|
||||||
hostname, _ := os.Hostname()
|
hostname, _ := os.Hostname()
|
||||||
result := NvidiaPowerBenchResult{
|
result := NvidiaPowerBenchResult{
|
||||||
BenchmarkVersion: benchmarkVersion,
|
BenchmarkVersion: benchmarkVersion,
|
||||||
@@ -4114,23 +4431,35 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
OverallStatus: "OK",
|
OverallStatus: "OK",
|
||||||
}
|
}
|
||||||
|
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||||
|
if infoErr != nil {
|
||||||
|
return "", infoErr
|
||||||
|
}
|
||||||
|
// Capture full nvidia-smi -q snapshot at the start of the run.
|
||||||
|
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||||
|
}
|
||||||
durationSec := powerBenchDurationSec(opts.Profile)
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
|
|
||||||
// Sample server idle power before any GPU load.
|
// Sample server idle power before any GPU load.
|
||||||
var serverIdleW float64
|
var serverIdleW float64
|
||||||
var serverIdleOK bool
|
var serverIdleOK bool
|
||||||
|
idleSDRStopCh := make(chan struct{})
|
||||||
|
idleSDRCh := startIPMISDRSampler(idleSDRStopCh, benchmarkPowerAutotuneSampleInterval)
|
||||||
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
|
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
|
||||||
serverIdleW = w
|
serverIdleW = w
|
||||||
serverIdleOK = true
|
serverIdleOK = true
|
||||||
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||||
}
|
}
|
||||||
sdrIdle := sampleIPMISDRPowerSensors()
|
close(idleSDRStopCh)
|
||||||
|
sdrIdle := summarizeSDRPowerSeries(<-idleSDRCh)
|
||||||
psuBefore := psuStatusSnapshot()
|
psuBefore := psuStatusSnapshot()
|
||||||
|
|
||||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||||
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||||||
singleIPMILoadedW := make(map[int]float64, len(selected))
|
singleIPMILoadedW := make(map[int]float64, len(selected))
|
||||||
|
singleRunSummaryByIndex := make(map[int]benchmarkPowerCalibrationRunSummary, len(selected))
|
||||||
var allRestoreActions []benchmarkRestoreAction
|
var allRestoreActions []benchmarkRestoreAction
|
||||||
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
|
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
|
||||||
var allPowerRows []GPUMetricRow
|
var allPowerRows []GPUMetricRow
|
||||||
@@ -4139,24 +4468,27 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
|
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||||
|
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
|
||||||
|
}
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
singlePowerStopCh := make(chan struct{})
|
singlePowerStopCh := make(chan struct{})
|
||||||
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
c, restore, singleRows, singleRun := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||||
close(singlePowerStopCh)
|
close(singlePowerStopCh)
|
||||||
sdrSingle := sampleIPMISDRPowerSensors()
|
|
||||||
if samples := <-singlePowerCh; len(samples) > 0 {
|
if samples := <-singlePowerCh; len(samples) > 0 {
|
||||||
singleIPMILoadedW[idx] = benchmarkMean(samples)
|
singleIPMILoadedW[idx] = benchmarkMean(samples)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
|
||||||
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
|
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun.LoadedSDR.PSUInW > 0 {
|
||||||
singleIPMILoadedW[idx] = sdrSingle.PSUInW
|
singleIPMILoadedW[idx] = singleRun.LoadedSDR.PSUInW
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR avg): %.0f W", idx, singleRun.LoadedSDR.PSUInW))
|
||||||
}
|
}
|
||||||
allRestoreActions = append(allRestoreActions, restore...)
|
allRestoreActions = append(allRestoreActions, restore...)
|
||||||
if r, ok := c[idx]; ok {
|
if r, ok := c[idx]; ok {
|
||||||
calibByIndex[idx] = r
|
calibByIndex[idx] = r
|
||||||
}
|
}
|
||||||
|
singleRunSummaryByIndex[idx] = singleRun
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
for i := len(allRestoreActions) - 1; i >= 0; i-- {
|
for i := len(allRestoreActions) - 1; i >= 0; i-- {
|
||||||
@@ -4199,11 +4531,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
||||||
gpu.Telemetry = &t
|
gpu.Telemetry = &t
|
||||||
}
|
}
|
||||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
if singleRun := singleRunSummaryByIndex[idx]; singleRun.AvgFanRPM > 0 {
|
||||||
gpu.AvgFanRPM = meanFanRPM(fans)
|
gpu.AvgFanRPM = singleRun.AvgFanRPM
|
||||||
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
gpu.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||||
gpu.AvgFanDutyCyclePct = duty
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
gpus = append(gpus, gpu)
|
gpus = append(gpus, gpu)
|
||||||
}
|
}
|
||||||
@@ -4259,10 +4589,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
||||||
var serverLoadedW float64
|
var serverLoadedW float64
|
||||||
var serverLoadedOK bool
|
var serverLoadedOK bool
|
||||||
// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
|
// sdrLastStep retains the phase-averaged SDR readings from the last ramp step
|
||||||
// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
|
// while GPUs are loaded. Used in the summary instead of re-sampling after the
|
||||||
// after the test when GPUs have already returned to idle.
|
// test when GPUs have already returned to idle.
|
||||||
var sdrLastStep sdrPowerSnapshot
|
var sdrLastStep benchmarkSDRSeriesSummary
|
||||||
|
|
||||||
// Step 1: reuse single-card calibration result directly.
|
// Step 1: reuse single-card calibration result directly.
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
@@ -4283,6 +4613,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
ramp.ServerLoadedW = w
|
ramp.ServerLoadedW = w
|
||||||
ramp.ServerDeltaW = w - serverIdleW
|
ramp.ServerDeltaW = w - serverIdleW
|
||||||
}
|
}
|
||||||
|
if singleRun := singleRunSummaryByIndex[firstIdx]; singleRun.AvgFanRPM > 0 {
|
||||||
|
ramp.AvgFanRPM = singleRun.AvgFanRPM
|
||||||
|
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||||
|
}
|
||||||
if !firstCalib.Completed {
|
if !firstCalib.Completed {
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||||
@@ -4333,7 +4667,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
stepPowerStopCh := make(chan struct{})
|
stepPowerStopCh := make(chan struct{})
|
||||||
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
stepCalib, stepRestore, stepRows, stepRun := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||||
close(stepPowerStopCh)
|
close(stepPowerStopCh)
|
||||||
var stepIPMILoadedW float64
|
var stepIPMILoadedW float64
|
||||||
@@ -4404,10 +4738,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Per-step PSU slot snapshot — also used as the authoritative loaded power
|
// Per-step PSU slot readings are averaged over the whole load phase rather
|
||||||
// source when SDR PSU sensors are available (more accurate than DCMI on
|
// than captured as a single end-of-phase snapshot.
|
||||||
// servers where DCMI covers only a subset of installed PSUs).
|
sdrStep := stepRun.LoadedSDR
|
||||||
sdrStep := sampleIPMISDRPowerSensors()
|
|
||||||
if len(sdrStep.PSUSlots) > 0 {
|
if len(sdrStep.PSUSlots) > 0 {
|
||||||
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||||
}
|
}
|
||||||
@@ -4425,7 +4758,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
|
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
|
||||||
ramp.ServerLoadedW = sdrStep.PSUInW
|
ramp.ServerLoadedW = sdrStep.PSUInW
|
||||||
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
||||||
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
|
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR avg): %.0f W", step, sdrStep.PSUInW))
|
||||||
if step == len(result.RecommendedSlotOrder) {
|
if step == len(result.RecommendedSlotOrder) {
|
||||||
serverLoadedW = sdrStep.PSUInW
|
serverLoadedW = sdrStep.PSUInW
|
||||||
serverLoadedOK = true
|
serverLoadedOK = true
|
||||||
@@ -4433,12 +4766,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fan state at end of ramp step.
|
// Fan values are phase averages over the same load window.
|
||||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
if stepRun.AvgFanRPM > 0 {
|
||||||
ramp.AvgFanRPM = meanFanRPM(fans)
|
ramp.AvgFanRPM = stepRun.AvgFanRPM
|
||||||
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
ramp.AvgFanDutyCyclePct = stepRun.AvgFanDutyCyclePct
|
||||||
ramp.AvgFanDutyCyclePct = duty
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Per-GPU telemetry from this ramp step's calibration.
|
// Per-GPU telemetry from this ramp step's calibration.
|
||||||
@@ -4491,8 +4822,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
|
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
|
||||||
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
|
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
|
||||||
if result.ServerPower != nil {
|
if result.ServerPower != nil {
|
||||||
// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
|
// Use the SDR phase average from the last ramp step (GPUs still loaded)
|
||||||
// than re-sampling here, which would capture post-test idle state.
|
// rather than re-sampling here, which would capture post-test idle state.
|
||||||
sdrLoaded := sdrLastStep
|
sdrLoaded := sdrLastStep
|
||||||
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
|
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
|
||||||
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
|
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
|
||||||
@@ -4512,6 +4843,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
||||||
"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
|
"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
|
||||||
}
|
}
|
||||||
|
if sdrLoaded.Samples > 0 {
|
||||||
|
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
||||||
|
fmt.Sprintf("Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step.", sdrLoaded.Samples))
|
||||||
|
}
|
||||||
// Detect DCMI partial coverage: direct SDR comparison first,
|
// Detect DCMI partial coverage: direct SDR comparison first,
|
||||||
// ramp heuristic as fallback when SDR PSU sensors are absent.
|
// ramp heuristic as fallback when SDR PSU sensors are absent.
|
||||||
dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||
|
dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||||
@@ -164,6 +169,99 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
before := BenchmarkThrottleCounters{}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||||
|
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||||
|
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 1000 }
|
||||||
|
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||||
|
t.Fatal("unexpected reset call")
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
var logs []string
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||||
|
logs = append(logs, line)
|
||||||
|
})
|
||||||
|
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||||
|
t.Fatalf("logs=%q want substring %q", got, want)
|
||||||
|
}
|
||||||
|
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||||
|
t.Fatalf("failed=%v want [0 2]", failed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
var calls []int
|
||||||
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
|
calls = append(calls, index)
|
||||||
|
return "ok\n", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if len(failed) != 0 {
|
||||||
|
t.Fatalf("failed=%v want no failures", failed)
|
||||||
|
}
|
||||||
|
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||||
|
t.Fatalf("calls=%v want %s", calls, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
|
if index == 5 {
|
||||||
|
return "busy\n", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
return "ok\n", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||||
|
t.Fatalf("failed=%v want %s", failed, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -179,6 +277,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
info benchmarkGPUInfo
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "prefers default tdp over current derated limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 600,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 600,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "caps default tdp to reported max limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 700,
|
||||||
|
MaxPowerLimitW: 650,
|
||||||
|
},
|
||||||
|
want: 650,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to current limit when default missing",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 525,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 525,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to max limit when only that is known",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
MaxPowerLimitW: 575,
|
||||||
|
},
|
||||||
|
want: 575,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||||
|
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -338,12 +489,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvsmiQ := []byte(`
|
nvsmiQ := []byte(`
|
||||||
GPU 00000000:4E:00.0
|
GPU 00000000:4E:00.0
|
||||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Min Power Limit : 200.00 W
|
||||||
|
Max Power Limit : 600.00 W
|
||||||
|
Default Power Limit : 575.00 W
|
||||||
|
Current Power Limit : 560.00 W
|
||||||
Clocks
|
Clocks
|
||||||
Graphics : 2422 MHz
|
Graphics : 2422 MHz
|
||||||
Memory : 12481 MHz
|
Memory : 12481 MHz
|
||||||
@@ -365,7 +520,7 @@ GPU 00000000:4F:00.0
|
|||||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||||
}
|
}
|
||||||
|
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
@@ -379,25 +534,49 @@ GPU 00000000:4F:00.0
|
|||||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||||
}
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||||
|
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||||
|
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].PowerLimitW != 560 {
|
||||||
|
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvsmiQ := []byte(`
|
nvsmiQ := []byte(`
|
||||||
GPU 00000000:4E:00.0
|
GPU 00000000:4E:00.0
|
||||||
|
Min Power Limit : 100.00 W
|
||||||
|
Max Power Limit : 900.00 W
|
||||||
Max Clocks
|
Max Clocks
|
||||||
Graphics : 9999 MHz
|
Graphics : 9999 MHz
|
||||||
Memory : 9999 MHz
|
Memory : 9999 MHz
|
||||||
`)
|
`)
|
||||||
// Already populated — must not be overwritten.
|
// Already populated — must not be overwritten.
|
||||||
infoByIndex := map[int]benchmarkGPUInfo{
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
0: {
|
||||||
|
Index: 0,
|
||||||
|
BusID: "00000000:4E:00.0",
|
||||||
|
MaxGraphicsClockMHz: 2430,
|
||||||
|
MaxMemoryClockMHz: 12481,
|
||||||
|
MinPowerLimitW: 200,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
}
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ var workerPatterns = []string{
|
|||||||
"stress-ng",
|
"stress-ng",
|
||||||
"stressapptest",
|
"stressapptest",
|
||||||
"memtester",
|
"memtester",
|
||||||
|
"nvbandwidth",
|
||||||
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||||
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||||
"nvvs",
|
"nvvs",
|
||||||
@@ -71,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
|
|||||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||||
base = exe[idx+1:]
|
base = exe[idx+1:]
|
||||||
}
|
}
|
||||||
for _, pat := range workerPatterns {
|
if shouldKillWorkerProcess(exe, base) {
|
||||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return killed
|
return killed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldKillWorkerProcess(exe, base string) bool {
|
||||||
|
for _, pat := range workerPatterns {
|
||||||
|
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|||||||
39
audit/internal/platform/kill_workers_test.go
Normal file
39
audit/internal/platform/kill_workers_test.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestShouldKillWorkerProcess(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
exe string
|
||||||
|
base string
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "nvbandwidth executable",
|
||||||
|
exe: "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
|
||||||
|
base: "nvbandwidth",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "dcgmi executable",
|
||||||
|
exe: "/usr/bin/dcgmi",
|
||||||
|
base: "dcgmi",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "unrelated process",
|
||||||
|
exe: "/usr/bin/bash",
|
||||||
|
base: "bash",
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
|
||||||
|
t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
51
audit/internal/platform/nvidia_recover.go
Normal file
51
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
|
||||||
|
|
||||||
|
func runNvidiaRecover(args ...string) (string, error) {
|
||||||
|
helperArgs := append([]string{nvidiaRecoverHelper}, args...)
|
||||||
|
if _, err := exec.LookPath("systemd-run"); err == nil {
|
||||||
|
unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
|
||||||
|
cmdArgs := []string{
|
||||||
|
"systemd-run",
|
||||||
|
"--quiet",
|
||||||
|
"--pipe",
|
||||||
|
"--wait",
|
||||||
|
"--collect",
|
||||||
|
"--service-type=oneshot",
|
||||||
|
"--unit", unit,
|
||||||
|
}
|
||||||
|
cmdArgs = append(cmdArgs, helperArgs...)
|
||||||
|
raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
|
||||||
|
func resetNvidiaGPU(index int) (string, error) {
|
||||||
|
if index < 0 {
|
||||||
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
|
}
|
||||||
|
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "GPU reset completed.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func restartNvidiaDrivers() (string, error) {
|
||||||
|
out, err := runNvidiaRecover("restart-drivers")
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "NVIDIA drivers restarted.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
@@ -30,10 +30,10 @@ import (
|
|||||||
// Sources:
|
// Sources:
|
||||||
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||||
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||||
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||||
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||||
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||||
@@ -48,15 +48,15 @@ const (
|
|||||||
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||||
SATEstimatedMemoryStressSec = 140
|
SATEstimatedMemoryStressSec = 140
|
||||||
|
|
||||||
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
|
// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
|
SATEstimatedNvidiaGPUValidateSec = 85
|
||||||
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
|
// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaGPUStressPerGPUSec = 450
|
SATEstimatedNvidiaGPUStressSec = 450
|
||||||
|
|
||||||
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
|
// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
|
SATEstimatedNvidiaTargetedStressSec = 350
|
||||||
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
|
// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
|
SATEstimatedNvidiaTargetedPowerSec = 350
|
||||||
|
|
||||||
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||||
SATEstimatedNvidiaPulseTestSec = 5000
|
SATEstimatedNvidiaPulseTestSec = 5000
|
||||||
@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||||
if index < 0 {
|
return resetNvidiaGPU(index)
|
||||||
return "", fmt.Errorf("gpu index must be >= 0")
|
|
||||||
}
|
|
||||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
|
||||||
if len(raw) == 0 && err == nil {
|
|
||||||
raw = []byte("GPU reset completed.\n")
|
|
||||||
}
|
|
||||||
return string(raw), err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||||
|
|||||||
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
|
if name == "bee-nvidia" && action == ServiceRestart {
|
||||||
|
return restartNvidiaDrivers()
|
||||||
|
}
|
||||||
// bee-web runs as the bee user; sudo is required to control system services.
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
|
|||||||
@@ -66,6 +66,7 @@ type HardwareSnapshot struct {
|
|||||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||||
|
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareHealthSummary struct {
|
type HardwareHealthSummary struct {
|
||||||
@@ -143,30 +144,33 @@ type HardwareMemory struct {
|
|||||||
|
|
||||||
type HardwareStorage struct {
|
type HardwareStorage struct {
|
||||||
HardwareComponentStatus
|
HardwareComponentStatus
|
||||||
Slot *string `json:"slot,omitempty"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
Type *string `json:"type,omitempty"`
|
Type *string `json:"type,omitempty"`
|
||||||
Model *string `json:"model,omitempty"`
|
Model *string `json:"model,omitempty"`
|
||||||
SizeGB *int `json:"size_gb,omitempty"`
|
SizeGB *int `json:"size_gb,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number,omitempty"`
|
LogicalBlockSizeBytes *int64 `json:"logical_block_size_bytes,omitempty"`
|
||||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
PhysicalBlockSizeBytes *int64 `json:"physical_block_size_bytes,omitempty"`
|
||||||
Firmware *string `json:"firmware,omitempty"`
|
MetadataBytesPerBlock *int64 `json:"metadata_bytes_per_block,omitempty"`
|
||||||
Interface *string `json:"interface,omitempty"`
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
Present *bool `json:"present,omitempty"`
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
Interface *string `json:"interface,omitempty"`
|
||||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
Present *bool `json:"present,omitempty"`
|
||||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||||
Telemetry map[string]any `json:"-"`
|
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||||
|
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||||
|
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||||
|
Telemetry map[string]any `json:"-"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwarePCIeDevice struct {
|
type HardwarePCIeDevice struct {
|
||||||
@@ -211,6 +215,7 @@ type HardwarePCIeDevice struct {
|
|||||||
Firmware *string `json:"firmware,omitempty"`
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
MacAddresses []string `json:"mac_addresses,omitempty"`
|
MacAddresses []string `json:"mac_addresses,omitempty"`
|
||||||
Present *bool `json:"present,omitempty"`
|
Present *bool `json:"present,omitempty"`
|
||||||
|
IOMMUGroup *int `json:"iommu_group,omitempty"`
|
||||||
Telemetry map[string]any `json:"-"`
|
Telemetry map[string]any `json:"-"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -44,3 +44,57 @@ func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
|
|||||||
t.Fatalf("missing event_logs payload: %s", text)
|
t.Fatalf("missing event_logs payload: %s", text)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
|
||||||
|
powerOnHours := int64(12450)
|
||||||
|
writtenBytes := int64(9876543210)
|
||||||
|
readBytes := int64(1234567890)
|
||||||
|
lifeRemainingPct := 91.0
|
||||||
|
logicalBlockSizeBytes := int64(512)
|
||||||
|
physicalBlockSizeBytes := int64(4096)
|
||||||
|
metadataBytesPerBlock := int64(8)
|
||||||
|
|
||||||
|
payload := HardwareIngestRequest{
|
||||||
|
CollectedAt: "2026-03-15T15:00:00Z",
|
||||||
|
Hardware: HardwareSnapshot{
|
||||||
|
Board: HardwareBoard{SerialNumber: "SRV-001"},
|
||||||
|
Storage: []HardwareStorage{
|
||||||
|
{
|
||||||
|
SerialNumber: stringPtr("DISK-001"),
|
||||||
|
Model: stringPtr("TestDisk"),
|
||||||
|
LogicalBlockSizeBytes: &logicalBlockSizeBytes,
|
||||||
|
PhysicalBlockSizeBytes: &physicalBlockSizeBytes,
|
||||||
|
MetadataBytesPerBlock: &metadataBytesPerBlock,
|
||||||
|
PowerOnHours: &powerOnHours,
|
||||||
|
WrittenBytes: &writtenBytes,
|
||||||
|
ReadBytes: &readBytes,
|
||||||
|
LifeRemainingPct: &lifeRemainingPct,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := json.Marshal(payload)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("marshal: %v", err)
|
||||||
|
}
|
||||||
|
text := string(data)
|
||||||
|
for _, needle := range []string{
|
||||||
|
`"storage":[{`,
|
||||||
|
`"logical_block_size_bytes":512`,
|
||||||
|
`"physical_block_size_bytes":4096`,
|
||||||
|
`"metadata_bytes_per_block":8`,
|
||||||
|
`"power_on_hours":12450`,
|
||||||
|
`"written_bytes":9876543210`,
|
||||||
|
`"read_bytes":1234567890`,
|
||||||
|
`"life_remaining_pct":91`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(text, needle) {
|
||||||
|
t.Fatalf("missing %q in payload: %s", needle, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func stringPtr(v string) *string {
|
||||||
|
return &v
|
||||||
|
}
|
||||||
|
|||||||
@@ -125,6 +125,8 @@ func defaultTaskPriority(target string, params taskParams) int {
|
|||||||
return taskPriorityInstall
|
return taskPriorityInstall
|
||||||
case "install-to-ram":
|
case "install-to-ram":
|
||||||
return taskPriorityInstallToRAM
|
return taskPriorityInstallToRAM
|
||||||
|
case "nvme-format":
|
||||||
|
return taskPriorityInstall
|
||||||
case "audit":
|
case "audit":
|
||||||
return taskPriorityAudit
|
return taskPriorityAudit
|
||||||
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||||
@@ -806,15 +808,14 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job == nil || !t.job.abort() {
|
||||||
t.job.abort()
|
globalQueue.mu.Unlock()
|
||||||
|
writeJSON(w, map[string]string{"status": "not_running"})
|
||||||
|
return
|
||||||
}
|
}
|
||||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
globalQueue.mu.Unlock()
|
||||||
platform.KillTestWorkers()
|
writeJSON(w, map[string]string{"status": "aborting"})
|
||||||
}
|
return
|
||||||
t.Status = TaskCancelled
|
|
||||||
now := time.Now()
|
|
||||||
t.DoneAt = &now
|
|
||||||
}
|
}
|
||||||
globalQueue.mu.Unlock()
|
globalQueue.mu.Unlock()
|
||||||
writeJSON(w, map[string]string{"status": "aborted"})
|
writeJSON(w, map[string]string{"status": "aborted"})
|
||||||
@@ -1039,6 +1040,81 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
|
|||||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBlackboxStatus(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
state, err := app.ReadBlackboxState(filepath.Join(h.opts.ExportDir, "blackbox-state.json"))
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
writeJSON(w, app.BlackboxState{Status: "disabled", Targets: []app.BlackboxTargetStatus{}})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if state.Targets == nil {
|
||||||
|
state.Targets = []app.BlackboxTargetStatus{}
|
||||||
|
}
|
||||||
|
writeJSON(w, state)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBlackboxEnable(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var target platform.RemovableTarget
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || strings.TrimSpace(target.Device) == "" {
|
||||||
|
writeError(w, http.StatusBadRequest, "device is required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
targets, err := h.opts.App.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
allowed := false
|
||||||
|
for _, candidate := range targets {
|
||||||
|
if candidate.Device == target.Device {
|
||||||
|
target = candidate
|
||||||
|
allowed = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !allowed {
|
||||||
|
writeError(w, http.StatusBadRequest, "device not in removable target list")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
marker, err := app.EnableBlackboxTarget(target)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]any{
|
||||||
|
"status": "ok",
|
||||||
|
"message": "Black-box marker written.",
|
||||||
|
"enrollment_id": marker.EnrollmentID,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBlackboxDisable(w http.ResponseWriter, r *http.Request) {
|
||||||
|
var req struct {
|
||||||
|
Device string `json:"device"`
|
||||||
|
EnrollmentID string `json:"enrollment_id"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := app.DisableBlackboxTarget(req.Device, req.EnrollmentID); err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
writeError(w, http.StatusNotFound, "black-box target not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": "ok", "message": "Black-box marker removed."})
|
||||||
|
}
|
||||||
|
|
||||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||||
@@ -1221,7 +1297,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
|||||||
var standardTools = []string{
|
var standardTools = []string{
|
||||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||||
"mstflint", "qrencode",
|
"mstflint",
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package webui
|
|||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
@@ -44,6 +46,66 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||||
|
|
||||||
|
h.handleAPIBlackboxStatus(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var state app.BlackboxState
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &state); err != nil {
|
||||||
|
t.Fatalf("decode state: %v", err)
|
||||||
|
}
|
||||||
|
if state.Status != "disabled" {
|
||||||
|
t.Fatalf("status=%q want disabled", state.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
||||||
|
exportDir := t.TempDir()
|
||||||
|
statePath := filepath.Join(exportDir, "blackbox-state.json")
|
||||||
|
if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
|
||||||
|
t.Fatalf("write state: %v", err)
|
||||||
|
}
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||||
|
|
||||||
|
h.handleAPIBlackboxStatus(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
|
||||||
|
t.Fatalf("body=%s", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVMeFormatModes(t *testing.T) {
|
||||||
|
raw := `
|
||||||
|
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||||
|
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||||
|
lbaf 2 : ms:0 lbads:12 rp:0
|
||||||
|
`
|
||||||
|
modes := parseNVMeFormatModes(raw)
|
||||||
|
if len(modes) != 3 {
|
||||||
|
t.Fatalf("modes=%#v want 3 modes", modes)
|
||||||
|
}
|
||||||
|
if modes[0].Mode != 0 || modes[0].DataBytes != 512 || modes[0].MetadataBytes != 0 || !modes[0].InUse {
|
||||||
|
t.Fatalf("mode 0=%#v", modes[0])
|
||||||
|
}
|
||||||
|
if modes[1].Label != "MODE 1 (512+8)" {
|
||||||
|
t.Fatalf("mode 1 label=%q", modes[1].Label)
|
||||||
|
}
|
||||||
|
if modes[2].DataBytes != 4096 || modes[2].MetadataBytes != 0 {
|
||||||
|
t.Fatalf("mode 2=%#v", modes[2])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
originalTasks := globalQueue.tasks
|
originalTasks := globalQueue.tasks
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ type jobState struct {
|
|||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
logPath string
|
logPath string
|
||||||
serialPrefix string
|
serialPrefix string
|
||||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||||
logBuf *bufio.Writer
|
logBuf *bufio.Writer
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,13 +53,21 @@ func (j *jobState) abort() bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (j *jobState) append(line string) {
|
func (j *jobState) append(line string) {
|
||||||
|
j.appendWithOptions(line, true, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) appendFromLog(line string) {
|
||||||
|
j.appendWithOptions(line, false, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
|
||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
j.lines = append(j.lines, line)
|
j.lines = append(j.lines, line)
|
||||||
if j.logPath != "" {
|
if persistLog && j.logPath != "" {
|
||||||
j.writeLogLineLocked(line)
|
j.writeLogLineLocked(line)
|
||||||
}
|
}
|
||||||
if j.serialPrefix != "" {
|
if serialMirror && j.serialPrefix != "" {
|
||||||
taskSerialWriteLine(j.serialPrefix + line)
|
taskSerialWriteLine(j.serialPrefix + line)
|
||||||
}
|
}
|
||||||
for _, ch := range j.subs {
|
for _, ch := range j.subs {
|
||||||
@@ -83,6 +91,7 @@ func (j *jobState) writeLogLineLocked(line string) {
|
|||||||
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||||
}
|
}
|
||||||
_, _ = j.logBuf.WriteString(line + "\n")
|
_, _ = j.logBuf.WriteString(line + "\n")
|
||||||
|
_ = j.logBuf.Flush()
|
||||||
}
|
}
|
||||||
|
|
||||||
// closeLog flushes and closes the log file. Called after all task output is done.
|
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||||
|
|||||||
368
audit/internal/webui/nvme_format.go
Normal file
368
audit/internal/webui/nvme_format.go
Normal file
@@ -0,0 +1,368 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
type nvmeFormatMode struct {
|
||||||
|
Mode int `json:"mode"`
|
||||||
|
DataBytes int64 `json:"data_bytes"`
|
||||||
|
MetadataBytes int64 `json:"metadata_bytes"`
|
||||||
|
InUse bool `json:"in_use"`
|
||||||
|
Label string `json:"label"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type nvmeFormatDisk struct {
|
||||||
|
Device string `json:"device"`
|
||||||
|
Model string `json:"model,omitempty"`
|
||||||
|
Serial string `json:"serial,omitempty"`
|
||||||
|
Size string `json:"size,omitempty"`
|
||||||
|
CurrentMode int `json:"current_mode"`
|
||||||
|
CurrentFormat string `json:"current_format"`
|
||||||
|
Modes []nvmeFormatMode `json:"modes"`
|
||||||
|
Error string `json:"error,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type nvmeListJSON struct {
|
||||||
|
Devices []struct {
|
||||||
|
DevicePath string `json:"DevicePath"`
|
||||||
|
ModelNumber string `json:"ModelNumber"`
|
||||||
|
SerialNumber string `json:"SerialNumber"`
|
||||||
|
PhysicalSize int64 `json:"PhysicalSize"`
|
||||||
|
} `json:"Devices"`
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
nvmeFormatDeviceRE = regexp.MustCompile(`^/dev/nvme[0-9]+n[0-9]+$`)
|
||||||
|
nvmeLBAFCompactLineRE = regexp.MustCompile(`(?im)^\s*lbaf\s+(\d+)\s*:\s*ms:(\d+)\s+lbads:(\d+).*$`)
|
||||||
|
nvmeLBAFVerboseLineRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+(\d+)\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*$`)
|
||||||
|
nvmeCommandContext = exec.CommandContext
|
||||||
|
nvmeListFormatsTimeout = 20 * time.Second
|
||||||
|
)
|
||||||
|
|
||||||
|
func listNVMeFormatDisks(ctx context.Context) ([]nvmeFormatDisk, error) {
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, nvmeListFormatsTimeout)
|
||||||
|
defer cancel()
|
||||||
|
out, err := nvmeCommandContext(ctx, "nvme", "list", "-o", "json").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var root nvmeListJSON
|
||||||
|
if err := json.Unmarshal(out, &root); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
disks := make([]nvmeFormatDisk, 0, len(root.Devices))
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, dev := range root.Devices {
|
||||||
|
path := strings.TrimSpace(dev.DevicePath)
|
||||||
|
if !nvmeFormatDeviceRE.MatchString(path) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := seen[path]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[path] = struct{}{}
|
||||||
|
disk := nvmeFormatDisk{
|
||||||
|
Device: path,
|
||||||
|
Model: strings.TrimSpace(dev.ModelNumber),
|
||||||
|
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||||
|
Size: formatNVMeBytes(dev.PhysicalSize),
|
||||||
|
CurrentMode: -1,
|
||||||
|
}
|
||||||
|
modes, parseErr := readNVMeFormatModes(ctx, path)
|
||||||
|
if parseErr != nil {
|
||||||
|
disk.Error = parseErr.Error()
|
||||||
|
}
|
||||||
|
disk.Modes = modes
|
||||||
|
for _, mode := range modes {
|
||||||
|
if mode.InUse {
|
||||||
|
disk.CurrentMode = mode.Mode
|
||||||
|
disk.CurrentFormat = formatNVMeBlock(mode.DataBytes, mode.MetadataBytes)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
disks = append(disks, disk)
|
||||||
|
}
|
||||||
|
sort.Slice(disks, func(i, j int) bool { return disks[i].Device < disks[j].Device })
|
||||||
|
return disks, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readNVMeFormatModes(ctx context.Context, device string) ([]nvmeFormatMode, error) {
|
||||||
|
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||||
|
return nil, fmt.Errorf("invalid NVMe device")
|
||||||
|
}
|
||||||
|
out, err := nvmeCommandContext(ctx, "nvme", "id-ns", device, "-H").CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
msg := strings.TrimSpace(string(out))
|
||||||
|
if msg == "" {
|
||||||
|
msg = err.Error()
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("%s", msg)
|
||||||
|
}
|
||||||
|
modes := parseNVMeFormatModes(string(out))
|
||||||
|
if len(modes) == 0 {
|
||||||
|
return nil, fmt.Errorf("no LBA format modes found")
|
||||||
|
}
|
||||||
|
return modes, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseNVMeFormatModes(raw string) []nvmeFormatMode {
|
||||||
|
byMode := map[int]nvmeFormatMode{}
|
||||||
|
for _, m := range nvmeLBAFCompactLineRE.FindAllStringSubmatch(raw, -1) {
|
||||||
|
mode, errMode := strconv.Atoi(m[1])
|
||||||
|
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||||
|
lbads, errLBADS := strconv.Atoi(m[3])
|
||||||
|
if errMode != nil || errMS != nil || errLBADS != nil || lbads < 0 || lbads >= 63 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
data := int64(1) << lbads
|
||||||
|
line := m[0]
|
||||||
|
byMode[mode] = nvmeFormatMode{
|
||||||
|
Mode: mode,
|
||||||
|
DataBytes: data,
|
||||||
|
MetadataBytes: metadata,
|
||||||
|
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||||
|
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, m := range nvmeLBAFVerboseLineRE.FindAllStringSubmatch(raw, -1) {
|
||||||
|
mode, errMode := strconv.Atoi(m[1])
|
||||||
|
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||||
|
data, errData := strconv.ParseInt(m[3], 10, 64)
|
||||||
|
if errMode != nil || errMS != nil || errData != nil || data <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
line := m[0]
|
||||||
|
byMode[mode] = nvmeFormatMode{
|
||||||
|
Mode: mode,
|
||||||
|
DataBytes: data,
|
||||||
|
MetadataBytes: metadata,
|
||||||
|
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||||
|
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
modes := make([]nvmeFormatMode, 0, len(byMode))
|
||||||
|
for _, mode := range byMode {
|
||||||
|
modes = append(modes, mode)
|
||||||
|
}
|
||||||
|
sort.Slice(modes, func(i, j int) bool { return modes[i].Mode < modes[j].Mode })
|
||||||
|
return modes
|
||||||
|
}
|
||||||
|
|
||||||
|
func runNVMeFormatTask(ctx context.Context, j *jobState, device string, lbaf int) error {
|
||||||
|
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||||
|
return fmt.Errorf("invalid NVMe device")
|
||||||
|
}
|
||||||
|
modes, err := readNVMeFormatModes(ctx, device)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
var selected nvmeFormatMode
|
||||||
|
found := false
|
||||||
|
for _, mode := range modes {
|
||||||
|
if mode.Mode == lbaf {
|
||||||
|
selected = mode
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
return fmt.Errorf("MODE %d is not available on %s", lbaf, device)
|
||||||
|
}
|
||||||
|
ms := 0
|
||||||
|
if selected.MetadataBytes > 0 {
|
||||||
|
ms = 1
|
||||||
|
}
|
||||||
|
j.append(fmt.Sprintf("Formatting %s to %s with --lbaf=%d --ms=%d --force", device, formatNVMeBlock(selected.DataBytes, selected.MetadataBytes), selected.Mode, ms))
|
||||||
|
cmd := nvmeCommandContext(ctx, "nvme", "format", device, fmt.Sprintf("--lbaf=%d", selected.Mode), fmt.Sprintf("--ms=%d", ms), "--force")
|
||||||
|
return streamCmdJob(j, cmd)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPINVMeFormats(w http.ResponseWriter, r *http.Request) {
|
||||||
|
disks, err := listNVMeFormatDisks(r.Context())
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, disks)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPINVMeFormatRun(w http.ResponseWriter, r *http.Request) {
|
||||||
|
var req struct {
|
||||||
|
Device string `json:"device"`
|
||||||
|
LBAF int `json:"lbaf"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !nvmeFormatDeviceRE.MatchString(req.Device) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid NVMe device")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
disks, err := listNVMeFormatDisks(r.Context())
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var label string
|
||||||
|
allowed := false
|
||||||
|
for _, disk := range disks {
|
||||||
|
if disk.Device != req.Device {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, mode := range disk.Modes {
|
||||||
|
if mode.Mode == req.LBAF {
|
||||||
|
allowed = true
|
||||||
|
label = mode.Label
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !allowed {
|
||||||
|
writeError(w, http.StatusBadRequest, "LBA format mode is not available for this device")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
name := fmt.Sprintf("NVMe Format %s to %s", filepath.Base(req.Device), label)
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID("nvme-format"),
|
||||||
|
Name: name,
|
||||||
|
Target: "nvme-format",
|
||||||
|
Priority: defaultTaskPriority("nvme-format", taskParams{}),
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{
|
||||||
|
Device: req.Device,
|
||||||
|
LBAF: req.LBAF,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatNVMeBlock renders a block geometry as "<data>+<metadata>",
// e.g. "4096+8" for 4 KiB data blocks with 8 metadata bytes per block.
func formatNVMeBlock(dataBytes, metadataBytes int64) string {
	buf := strconv.AppendInt(nil, dataBytes, 10)
	buf = append(buf, '+')
	buf = strconv.AppendInt(buf, metadataBytes, 10)
	return string(buf)
}
|
||||||
|
|
||||||
|
// formatNVMeBytes renders a byte count with decimal (1000-based) units,
// matching how drive vendors advertise capacity. Values under 1000 are
// shown as exact byte counts; non-positive values yield the empty string.
func formatNVMeBytes(n int64) string {
	if n <= 0 {
		return ""
	}
	const step = 1000.0
	suffixes := [...]string{"B", "KB", "MB", "GB", "TB", "PB"}
	val, idx := float64(n), 0
	for val >= step && idx < len(suffixes)-1 {
		val /= step
		idx++
	}
	// Below 1 KB, keep the exact integer rather than a rounded float.
	if idx == 0 {
		return fmt.Sprintf("%d B", n)
	}
	return fmt.Sprintf("%.1f %s", val, suffixes[idx])
}
|
||||||
|
|
||||||
|
// renderNVMeFormatInline returns the HTML/JS fragment for the NVMe block
// format tool: a status line, a disk table, and the script that fetches
// /api/tools/nvme-formats, enqueues /api/tools/nvme-format/run tasks, and
// polls /api/tasks every 1.5s until the queued format task finishes.
func renderNVMeFormatInline() string {
	// NOTE(review): the nvmeFormatEsc replacement map below appears to map
	// each character to itself (e.g. '&' -> '&'), which makes the escaper a
	// no-op; this looks like HTML-entity mangling introduced by extraction
	// ('&' was presumably '&amp;', etc.) — verify against the original file.
	return `<div id="nvme-format-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVMe disks...</div>
<div id="nvme-format-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<script>
function nvmeFormatEsc(s) {
return String(s == null ? '' : s).replace(/[&<>"']/g, function(c) {
return {'&':'&','<':'<','>':'>','"':'"',"'":'''}[c];
});
}
function loadNVMeFormats() {
var status = document.getElementById('nvme-format-status');
var table = document.getElementById('nvme-format-table');
status.textContent = 'Loading NVMe disks...';
status.style.color = 'var(--muted)';
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
fetch('/api/tools/nvme-formats').then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(disks) {
window._nvmeFormatDisks = Array.isArray(disks) ? disks : [];
if (!window._nvmeFormatDisks.length) {
status.textContent = 'No NVMe disks found.';
table.innerHTML = '';
return;
}
status.textContent = window._nvmeFormatDisks.length + ' NVMe disk(s) found.';
var rows = window._nvmeFormatDisks.map(function(d, idx) {
var current = d.current_format ? (d.current_format + ' / MODE ' + d.current_mode) : 'unknown';
var detail = [d.model || '', d.serial || '', d.size || ''].filter(Boolean).join(' | ');
var options = (d.modes || []).map(function(m) {
return '<option value="' + m.mode + '"' + (m.in_use ? ' selected' : '') + '>' + nvmeFormatEsc(m.label) + '</option>';
}).join('');
var disabled = options ? '' : ' disabled';
var err = d.error ? '<div style="font-size:12px;color:var(--crit-fg,#9f3a38);margin-top:4px">' + nvmeFormatEsc(d.error) + '</div>' : '';
return '<tr>'
+ '<td style="font-family:monospace;white-space:nowrap">' + nvmeFormatEsc(d.device) + (detail ? '<div style="font-family:inherit;font-size:12px;color:var(--muted)">' + nvmeFormatEsc(detail) + '</div>' : '') + '</td>'
+ '<td style="white-space:nowrap">' + nvmeFormatEsc(current) + err + '</td>'
+ '<td style="white-space:nowrap"><select id="nvme-format-select-' + idx + '"' + disabled + '>' + options + '</select></td>'
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-primary" onclick="nvmeFormatRun(' + idx + ', this)"' + disabled + '>Apply</button><div class="nvme-format-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div></td>'
+ '</tr>';
}).join('');
table.innerHTML = '<table><tr><th>Disk</th><th>Current block / mode</th><th>New mode</th><th>Action</th></tr>' + rows + '</table>';
}).catch(function(e) {
status.textContent = 'Error loading NVMe disks: ' + e.message;
status.style.color = 'var(--crit-fg,#9f3a38)';
table.innerHTML = '';
});
}
function nvmeWaitTaskDone(taskID, rowMsg) {
var timer = setInterval(function() {
fetch('/api/tasks').then(function(r) { return r.json(); }).then(function(tasks) {
var task = (tasks || []).find(function(t) { return t.id === taskID; });
if (!task) return;
if (task.status === 'done' || task.status === 'failed' || task.status === 'cancelled') {
clearInterval(timer);
rowMsg.textContent = 'Task ' + taskID + ': ' + task.status + (task.error ? ' - ' + task.error : '');
rowMsg.style.color = task.status === 'done' ? 'var(--ok,green)' : 'var(--crit-fg,#9f3a38)';
loadNVMeFormats();
}
}).catch(function(){});
}, 1500);
}
function nvmeFormatRun(idx, btn) {
var disk = (window._nvmeFormatDisks || [])[idx];
var select = document.getElementById('nvme-format-select-' + idx);
var row = btn.closest('td');
var rowMsg = row.querySelector('.nvme-format-row-msg');
if (!disk || !select) return;
var lbaf = parseInt(select.value, 10);
var mode = (disk.modes || []).find(function(m) { return m.mode === lbaf; });
if (!mode) return;
if (!window.confirm('Format ' + disk.device + ' to ' + mode.label + '? This erases data on the namespace.')) return;
btn.disabled = true;
rowMsg.style.color = 'var(--muted)';
rowMsg.textContent = 'Queued...';
fetch('/api/tools/nvme-format/run', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({device: disk.device, lbaf: lbaf})
}).then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(d) {
rowMsg.textContent = 'Task ' + d.task_id + ' queued.';
nvmeWaitTaskDone(d.task_id, rowMsg);
}).catch(function(e) {
rowMsg.style.color = 'var(--crit-fg,#9f3a38)';
rowMsg.textContent = 'Error: ' + e.message;
}).finally(function() {
btn.disabled = false;
});
}
loadNVMeFormats();
</script>`
}
|
||||||
|
|
||||||
|
// renderNVMeFormatCard wraps the NVMe format tool fragment in a standard
// card shell with a header and a refresh button that re-invokes the
// fragment's loadNVMeFormats() script.
func renderNVMeFormatCard() string {
	return `<div class="card"><div class="card-head">NVMe Block Format <button class="btn btn-sm btn-secondary" onclick="loadNVMeFormats()" style="margin-left:auto">↻ Refresh</button></div><div class="card-body">` +
		`<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Lists NVMe namespaces and changes their LBA format through a queued task.</p>` +
		renderNVMeFormatInline() + `</div></div>`
}
|
||||||
@@ -102,47 +102,69 @@ window.supportBundleDownload = function() {
|
|||||||
|
|
||||||
func renderUSBExportCard() string {
|
func renderUSBExportCard() string {
|
||||||
return `<div class="card" style="margin-top:16px">
|
return `<div class="card" style="margin-top:16px">
|
||||||
<div class="card-head">Export to USB
|
<div class="card-head">USB Black-Box
|
||||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
<button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="card-body">` + renderUSBExportInline() + `</div>
|
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||||
</div>`
|
</div>`
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderUSBExportInline() string {
|
func renderUSBExportInline() string {
|
||||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
|
||||||
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||||
|
<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
|
||||||
<div id="usb-targets" style="margin-top:12px"></div>
|
<div id="usb-targets" style="margin-top:12px"></div>
|
||||||
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||||
<script>
|
<script>
|
||||||
(function(){
|
(function(){
|
||||||
function usbRefresh() {
|
function blackboxRefresh() {
|
||||||
document.getElementById('usb-status').textContent = 'Scanning...';
|
document.getElementById('usb-status').textContent = 'Scanning...';
|
||||||
|
document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
|
||||||
document.getElementById('usb-targets').innerHTML = '';
|
document.getElementById('usb-targets').innerHTML = '';
|
||||||
document.getElementById('usb-msg').textContent = '';
|
document.getElementById('usb-msg').textContent = '';
|
||||||
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
|
Promise.all([
|
||||||
window._usbTargets = Array.isArray(targets) ? targets : [];
|
fetch('/api/export/usb').then(r=>r.json()),
|
||||||
|
fetch('/api/blackbox/status').then(r=>r.json())
|
||||||
|
]).then(function(values) {
|
||||||
|
const targets = Array.isArray(values[0]) ? values[0] : [];
|
||||||
|
const state = values[1] || {};
|
||||||
|
const active = Array.isArray(state.targets) ? state.targets : [];
|
||||||
|
window._usbTargets = targets;
|
||||||
|
window._blackboxTargets = active;
|
||||||
const st = document.getElementById('usb-status');
|
const st = document.getElementById('usb-status');
|
||||||
const ct = document.getElementById('usb-targets');
|
const ct = document.getElementById('usb-targets');
|
||||||
|
const summary = document.getElementById('blackbox-summary');
|
||||||
|
if (state.boot_folder) {
|
||||||
|
summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
|
||||||
|
} else {
|
||||||
|
summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
|
||||||
|
}
|
||||||
if (!targets || targets.length === 0) {
|
if (!targets || targets.length === 0) {
|
||||||
st.textContent = 'No removable USB devices found.';
|
st.textContent = 'No removable USB devices found.';
|
||||||
return;
|
} else {
|
||||||
|
st.textContent = targets.length + ' device(s) found:';
|
||||||
}
|
}
|
||||||
st.textContent = targets.length + ' device(s) found:';
|
const byDevice = {};
|
||||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
|
active.forEach(function(item) { byDevice[item.device] = item; });
|
||||||
|
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
|
||||||
targets.map((t, idx) => {
|
targets.map((t, idx) => {
|
||||||
const dev = t.device || '';
|
const dev = t.device || '';
|
||||||
const label = t.label || '';
|
const label = t.label || '';
|
||||||
const model = t.model || '';
|
const model = t.model || '';
|
||||||
|
const state = byDevice[dev];
|
||||||
|
const status = state ? (state.status + (state.flush_period ? ', flush ' + state.flush_period : '')) : 'not enrolled';
|
||||||
|
const detail = state && state.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+state.last_error+'</div>') : '';
|
||||||
return '<tr>' +
|
return '<tr>' +
|
||||||
'<td style="font-family:monospace">'+dev+'</td>' +
|
'<td style="font-family:monospace">'+dev+'</td>' +
|
||||||
'<td>'+t.fs_type+'</td>' +
|
'<td>'+t.fs_type+'</td>' +
|
||||||
'<td>'+t.size+'</td>' +
|
'<td>'+t.size+'</td>' +
|
||||||
'<td>'+label+'</td>' +
|
'<td>'+label+'</td>' +
|
||||||
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||||
|
'<td style="font-size:12px">'+status+detail+'</td>' +
|
||||||
'<td style="white-space:nowrap">' +
|
'<td style="white-space:nowrap">' +
|
||||||
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
|
(state
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
|
? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
|
||||||
|
: '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
|
||||||
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
|
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
|
||||||
'</td></tr>';
|
'</td></tr>';
|
||||||
}).join('') + '</table>';
|
}).join('') + '</table>';
|
||||||
@@ -150,7 +172,7 @@ function usbRefresh() {
|
|||||||
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
window.usbExport = function(type, targetIndex, btn) {
|
window.blackboxEnable = function(targetIndex, btn) {
|
||||||
const target = (window._usbTargets || [])[targetIndex];
|
const target = (window._usbTargets || [])[targetIndex];
|
||||||
if (!target) {
|
if (!target) {
|
||||||
const msg = document.getElementById('usb-msg');
|
const msg = document.getElementById('usb-msg');
|
||||||
@@ -164,15 +186,15 @@ window.usbExport = function(type, targetIndex, btn) {
|
|||||||
const originalText = btn ? btn.textContent : '';
|
const originalText = btn ? btn.textContent : '';
|
||||||
if (btn) {
|
if (btn) {
|
||||||
btn.disabled = true;
|
btn.disabled = true;
|
||||||
btn.textContent = 'Exporting...';
|
btn.textContent = 'Enabling...';
|
||||||
}
|
}
|
||||||
if (rowMsg) {
|
if (rowMsg) {
|
||||||
rowMsg.style.color = 'var(--muted)';
|
rowMsg.style.color = 'var(--muted)';
|
||||||
rowMsg.textContent = 'Working...';
|
rowMsg.textContent = 'Working...';
|
||||||
}
|
}
|
||||||
msg.style.color = 'var(--muted)';
|
msg.style.color = 'var(--muted)';
|
||||||
msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
|
msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
|
||||||
fetch('/api/export/usb/'+type, {
|
fetch('/api/blackbox/enable', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {'Content-Type':'application/json'},
|
headers: {'Content-Type':'application/json'},
|
||||||
body: JSON.stringify(target)
|
body: JSON.stringify(target)
|
||||||
@@ -199,10 +221,64 @@ window.usbExport = function(type, targetIndex, btn) {
|
|||||||
btn.disabled = false;
|
btn.disabled = false;
|
||||||
btn.textContent = originalText;
|
btn.textContent = originalText;
|
||||||
}
|
}
|
||||||
|
setTimeout(blackboxRefresh, 300);
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
window.usbRefresh = usbRefresh;
|
window.blackboxDisable = function(targetIndex, btn) {
|
||||||
usbRefresh();
|
const target = (window._usbTargets || [])[targetIndex];
|
||||||
|
const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
|
||||||
|
if (!target || !active) {
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: black-box target not found. Refresh and try again.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
const row = btn ? btn.closest('td') : null;
|
||||||
|
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
|
||||||
|
const originalText = btn ? btn.textContent : '';
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Disabling...';
|
||||||
|
}
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--muted)';
|
||||||
|
rowMsg.textContent = 'Working...';
|
||||||
|
}
|
||||||
|
msg.style.color = 'var(--muted)';
|
||||||
|
msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
|
||||||
|
fetch('/api/blackbox/disable', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
|
||||||
|
}).then(async r => {
|
||||||
|
const d = await r.json();
|
||||||
|
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
|
||||||
|
return d;
|
||||||
|
}).then(d => {
|
||||||
|
msg.style.color = 'var(--ok,green)';
|
||||||
|
msg.textContent = d.message || 'Done.';
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--ok,green)';
|
||||||
|
rowMsg.textContent = d.message || 'Done.';
|
||||||
|
}
|
||||||
|
}).catch(e => {
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: '+e;
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--err,red)';
|
||||||
|
rowMsg.textContent = 'Error: ' + e;
|
||||||
|
}
|
||||||
|
}).finally(() => {
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = originalText;
|
||||||
|
}
|
||||||
|
setTimeout(blackboxRefresh, 300);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
window.blackboxRefresh = blackboxRefresh;
|
||||||
|
blackboxRefresh();
|
||||||
})();
|
})();
|
||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
@@ -382,7 +458,7 @@ function installToRAM() {
|
|||||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
` + renderSupportBundleInline() + `
|
` + renderSupportBundleInline() + `
|
||||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||||
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
|
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||||
` + renderUSBExportInline() + `
|
` + renderUSBExportInline() + `
|
||||||
</div>
|
</div>
|
||||||
</div></div>
|
</div></div>
|
||||||
@@ -399,6 +475,7 @@ function installToRAM() {
|
|||||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||||
renderServicesInline() + `</div></div>
|
renderServicesInline() + `</div></div>
|
||||||
|
|
||||||
|
` + renderNVMeFormatCard() + `
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
function checkTools() {
|
function checkTools() {
|
||||||
|
|||||||
@@ -207,7 +207,7 @@ func renderInstall() string {
|
|||||||
func renderTasks() string {
|
func renderTasks() string {
|
||||||
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
||||||
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
||||||
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
|
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
|
||||||
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
||||||
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
|
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
|
||||||
</div>
|
</div>
|
||||||
@@ -289,7 +289,7 @@ function cancelAll() {
|
|||||||
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
||||||
}
|
}
|
||||||
function killWorkers() {
|
function killWorkers() {
|
||||||
if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
|
if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
|
||||||
fetch('/api/tasks/kill-workers',{method:'POST'})
|
fetch('/api/tasks/kill-workers',{method:'POST'})
|
||||||
.then(r=>r.json())
|
.then(r=>r.json())
|
||||||
.then(d=>{
|
.then(d=>{
|
||||||
|
|||||||
@@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int {
|
|||||||
}
|
}
|
||||||
total := platform.SATEstimatedCPUValidateSec +
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
platform.SATEstimatedMemoryValidateSec +
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
|
||||||
platform.SATEstimatedNvidiaInterconnectSec +
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
platform.SATEstimatedNvidiaBandwidthSec
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||||
|
}
|
||||||
return total
|
return total
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int {
|
|||||||
}
|
}
|
||||||
total := platform.SATEstimatedCPUStressSec +
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
platform.SATEstimatedMemoryStressSec +
|
platform.SATEstimatedMemoryStressSec +
|
||||||
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
|
||||||
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
|
||||||
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
|
||||||
platform.SATEstimatedNvidiaPulseTestSec +
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
platform.SATEstimatedNvidiaInterconnectSec +
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
platform.SATEstimatedNvidiaBandwidthSec
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||||
|
}
|
||||||
return total
|
return total
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
func() string {
|
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||||
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||||
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||||
if n > 0 {
|
|
||||||
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
|
||||||
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
|
||||||
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
|
||||||
validateFmtDur(perV), validateFmtDur(perS))
|
|
||||||
}(),
|
|
||||||
)) +
|
)) +
|
||||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
func() string {
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
|
||||||
s := "Skipped in Validate. "
|
|
||||||
if n > 0 {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
|
||||||
} else {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
|
||||||
}
|
|
||||||
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
|
||||||
}(),
|
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-targeted-power">` +
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
@@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
`<code>dcgmi diag targeted_power</code>`,
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
func() string {
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
|
||||||
s := "Skipped in Validate. "
|
|
||||||
if n > 0 {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
|
||||||
} else {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
|
||||||
}
|
|
||||||
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
|
||||||
}(),
|
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-pulse">` +
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
@@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) {
|
|||||||
return enqueueSATTarget(target, overrides)
|
return enqueueSATTarget(target, overrides)
|
||||||
.then(d => streamSATTask(d.task_id, title, false));
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
}
|
}
|
||||||
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
const nvidiaPerGPUTargets = [];
|
||||||
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
function satAllGPUIndicesForMulti() {
|
function satAllGPUIndicesForMulti() {
|
||||||
return Promise.resolve(satSelectedGPUIndices());
|
return Promise.resolve(satSelectedGPUIndices());
|
||||||
}
|
}
|
||||||
@@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
function runNvidiaValidateSet(target) {
|
function runNvidiaValidateSet(target) {
|
||||||
return loadSatNvidiaGPUs().then(gpus => {
|
const selected = satSelectedGPUIndices();
|
||||||
const selected = satSelectedGPUIndices();
|
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||||
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
|
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||||
if (!picked.length) {
|
|
||||||
throw new Error('Select at least one NVIDIA GPU.');
|
|
||||||
}
|
|
||||||
if (picked.length === 1) {
|
|
||||||
const gpu = picked[0];
|
|
||||||
return runSATWithOverrides(target, {
|
|
||||||
gpu_indices: [Number(gpu.index)],
|
|
||||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
|
|
||||||
});
|
|
||||||
}
|
|
||||||
document.getElementById('sat-output').style.display='block';
|
|
||||||
document.getElementById('sat-title').textContent = '— ' + target;
|
|
||||||
const term = document.getElementById('sat-terminal');
|
|
||||||
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
|
|
||||||
const labelBase = satLabels()[target] || ('Validate ' + target);
|
|
||||||
const runNext = (idx) => {
|
|
||||||
if (idx >= picked.length) return Promise.resolve();
|
|
||||||
const gpu = picked[idx];
|
|
||||||
const gpuLabel = satGPUDisplayName(gpu);
|
|
||||||
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
|
|
||||||
return enqueueSATTarget(target, {
|
|
||||||
gpu_indices: [Number(gpu.index)],
|
|
||||||
display_name: labelBase + ' (' + gpuLabel + ')',
|
|
||||||
}).then(d => {
|
|
||||||
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
|
|
||||||
}).then(function() {
|
|
||||||
return runNext(idx + 1);
|
|
||||||
});
|
|
||||||
};
|
|
||||||
return runNext(0);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
function runAMDValidateSet() {
|
function runAMDValidateSet() {
|
||||||
const targets = selectedAMDValidateTargets();
|
const targets = selectedAMDValidateTargets();
|
||||||
|
|||||||
@@ -301,11 +301,14 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Export
|
// Export
|
||||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
|
||||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
|
||||||
|
mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)
|
||||||
|
|
||||||
// Tools
|
// Tools
|
||||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||||
|
mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
|
||||||
|
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
|
||||||
|
|
||||||
// GPU presence / tools
|
// GPU presence / tools
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
@@ -571,6 +574,7 @@ func (h *handler) handleExportIndex(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
|
||||||
snapshot, _ := loadSnapshot(h.opts.AuditPath)
|
snapshot, _ := loadSnapshot(h.opts.AuditPath)
|
||||||
|
snapshot = enrichSnapshotForViewer(snapshot)
|
||||||
body, err := viewer.RenderHTML(snapshot, h.opts.Title)
|
body, err := viewer.RenderHTML(snapshot, h.opts.Title)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
|||||||
@@ -671,11 +671,17 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
|||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `Export to USB`) {
|
if !strings.Contains(body, `USB Black-Box`) {
|
||||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
t.Fatalf("tools page missing usb black-box section: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
if !strings.Contains(body, `/api/blackbox/status`) {
|
||||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `NVMe Block Format`) {
|
||||||
|
t.Fatalf("tools page missing nvme block format section: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/tools/nvme-formats`) || !strings.Contains(body, `/api/tools/nvme-format/run`) {
|
||||||
|
t.Fatalf("tools page missing nvme format api usage: %s", body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1016,6 +1022,39 @@ func TestViewerRendersLatestSnapshot(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestViewerRendersDerivedStorageBlockFormat(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
body := `{
|
||||||
|
"collected_at":"2026-04-29T00:05:00Z",
|
||||||
|
"hardware":{
|
||||||
|
"board":{"serial_number":"SERIAL-NEW"},
|
||||||
|
"storage":[
|
||||||
|
{
|
||||||
|
"serial_number":"DISK-1",
|
||||||
|
"model":"Test NVMe",
|
||||||
|
"logical_block_size_bytes":512,
|
||||||
|
"physical_block_size_bytes":4096,
|
||||||
|
"metadata_bytes_per_block":8
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}`
|
||||||
|
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/viewer", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), "512+8") {
|
||||||
|
t.Fatalf("viewer body missing derived block format: %s", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
@@ -1038,6 +1077,36 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAuditJSONDoesNotInjectDerivedStorageBlockFormat(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
body := `{
|
||||||
|
"hardware":{
|
||||||
|
"board":{"serial_number":"SERIAL-API"},
|
||||||
|
"storage":[
|
||||||
|
{
|
||||||
|
"serial_number":"DISK-1",
|
||||||
|
"logical_block_size_bytes":512,
|
||||||
|
"metadata_bytes_per_block":8
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}`
|
||||||
|
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit.json", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
if strings.Contains(rec.Body.String(), "block_format") {
|
||||||
|
t.Fatalf("audit.json should remain contract-only: %s", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
|
func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"})
|
handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
|
|||||||
511
audit/internal/webui/task_runner.go
Normal file
511
audit/internal/webui/task_runner.go
Normal file
@@ -0,0 +1,511 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/runtimeenv"
|
||||||
|
)
|
||||||
|
|
||||||
|
type taskRunnerState struct {
|
||||||
|
PID int `json:"pid"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Error string `json:"error,omitempty"`
|
||||||
|
UpdatedAt time.Time `json:"updated_at"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskRunnerStatePath(t *Task) string {
|
||||||
|
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return filepath.Join(t.ArtifactsDir, "runner-state.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTaskRunnerState(t *Task, state taskRunnerState) error {
|
||||||
|
path := taskRunnerStatePath(t)
|
||||||
|
if path == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(state, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
|
||||||
|
path := taskRunnerStatePath(t)
|
||||||
|
if path == "" {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
var state taskRunnerState
|
||||||
|
if err := json.Unmarshal(data, &state); err != nil {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
return state, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func processAlive(pid int) bool {
|
||||||
|
if pid <= 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
err := syscall.Kill(pid, 0)
|
||||||
|
return err == nil || err == syscall.EPERM
|
||||||
|
}
|
||||||
|
|
||||||
|
func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
|
||||||
|
now := time.Now()
|
||||||
|
t.DoneAt = &now
|
||||||
|
switch {
|
||||||
|
case cancelled:
|
||||||
|
t.Status = TaskCancelled
|
||||||
|
t.ErrMsg = "aborted"
|
||||||
|
case strings.TrimSpace(errMsg) != "":
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = errMsg
|
||||||
|
default:
|
||||||
|
t.Status = TaskDone
|
||||||
|
t.ErrMsg = ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
|
||||||
|
if opts == nil {
|
||||||
|
j.append("ERROR: handler options not configured")
|
||||||
|
j.finish("handler options not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
a := opts.App
|
||||||
|
|
||||||
|
recovered := len(j.lines) > 0
|
||||||
|
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||||
|
if recovered {
|
||||||
|
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
archive string
|
||||||
|
err error
|
||||||
|
)
|
||||||
|
|
||||||
|
switch t.Target {
|
||||||
|
case "nvidia":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
diagLevel := 2
|
||||||
|
if t.params.StressMode {
|
||||||
|
diagLevel = 3
|
||||||
|
}
|
||||||
|
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||||
|
result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
|
||||||
|
if e != nil {
|
||||||
|
err = e
|
||||||
|
} else {
|
||||||
|
archive = result.Body
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||||
|
}
|
||||||
|
case "nvidia-targeted-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if dur <= 0 {
|
||||||
|
dur = 300
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-bench-perf":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
SizeMB: t.params.SizeMB,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
RunNCCL: t.params.RunNCCL,
|
||||||
|
ParallelGPUs: t.params.ParallelGPUs,
|
||||||
|
RampStep: t.params.RampStep,
|
||||||
|
RampTotal: t.params.RampTotal,
|
||||||
|
RampRunID: t.params.RampRunID,
|
||||||
|
}, j.append)
|
||||||
|
case "nvidia-bench-power":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
RampStep: t.params.RampStep,
|
||||||
|
RampTotal: t.params.RampTotal,
|
||||||
|
RampRunID: t.params.RampRunID,
|
||||||
|
}, j.append)
|
||||||
|
case "nvidia-bench-autotune":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
SizeMB: t.params.SizeMB,
|
||||||
|
}, t.params.BenchmarkKind, j.append)
|
||||||
|
case "nvidia-compute":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||||
|
if planErr != nil {
|
||||||
|
err = planErr
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||||
|
dur = rampPlan.DurationSec
|
||||||
|
}
|
||||||
|
if rampPlan.StaggerSeconds > 0 {
|
||||||
|
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
|
||||||
|
case "nvidia-targeted-power":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-pulse":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-bandwidth":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-interconnect":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||||
|
if planErr != nil {
|
||||||
|
err = planErr
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||||
|
dur = rampPlan.DurationSec
|
||||||
|
}
|
||||||
|
if rampPlan.StaggerSeconds > 0 {
|
||||||
|
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||||
|
}
|
||||||
|
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||||
|
DurationSec: dur,
|
||||||
|
Loader: t.params.Loader,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
StaggerSeconds: rampPlan.StaggerSeconds,
|
||||||
|
}, j.append)
|
||||||
|
case "memory":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
|
||||||
|
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
|
||||||
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||||
|
case "storage":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||||
|
case "cpu":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
if dur <= 0 {
|
||||||
|
if t.params.StressMode {
|
||||||
|
dur = 1800
|
||||||
|
} else {
|
||||||
|
dur = 60
|
||||||
|
}
|
||||||
|
}
|
||||||
|
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||||
|
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "amd":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-mem":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-bandwidth":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "memory-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "sat-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "platform-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||||
|
runOpts.Components = t.params.PlatformComponents
|
||||||
|
archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
|
||||||
|
case "audit":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
result, e := a.RunAuditNow(opts.RuntimeMode)
|
||||||
|
if e != nil {
|
||||||
|
err = e
|
||||||
|
} else {
|
||||||
|
for _, line := range splitLines(result.Body) {
|
||||||
|
j.append(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case "support-bundle":
|
||||||
|
j.append("Building support bundle...")
|
||||||
|
archive, err = buildSupportBundle(opts.ExportDir)
|
||||||
|
case "install":
|
||||||
|
if strings.TrimSpace(t.params.Device) == "" {
|
||||||
|
err = fmt.Errorf("device is required")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
installLogPath := platform.InstallLogPath(t.params.Device)
|
||||||
|
j.append("Install log: " + installLogPath)
|
||||||
|
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
|
||||||
|
case "install-to-ram":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
err = a.RunInstallToRAM(ctx, j.append)
|
||||||
|
case "nvme-format":
|
||||||
|
if strings.TrimSpace(t.params.Device) == "" {
|
||||||
|
err = fmt.Errorf("device is required")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
|
||||||
|
default:
|
||||||
|
j.append("ERROR: unknown target: " + t.Target)
|
||||||
|
j.finish("unknown target")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if archive != "" {
|
||||||
|
archivePath := app.ExtractArchivePath(archive)
|
||||||
|
if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||||
|
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||||
|
}
|
||||||
|
if opts.App != nil && opts.App.StatusDB != nil {
|
||||||
|
app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
j.append("Aborted.")
|
||||||
|
j.finish("aborted")
|
||||||
|
} else {
|
||||||
|
j.append("ERROR: " + err.Error())
|
||||||
|
j.finish(err.Error())
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if archive != "" {
|
||||||
|
j.append("Archive: " + archive)
|
||||||
|
}
|
||||||
|
j.finish("")
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadPersistedTask(statePath, taskID string) (*Task, error) {
|
||||||
|
data, err := os.ReadFile(statePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var persisted []persistedTask
|
||||||
|
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
for _, pt := range persisted {
|
||||||
|
if pt.ID != taskID {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
t := &Task{
|
||||||
|
ID: pt.ID,
|
||||||
|
Name: pt.Name,
|
||||||
|
Target: pt.Target,
|
||||||
|
Priority: pt.Priority,
|
||||||
|
Status: pt.Status,
|
||||||
|
CreatedAt: pt.CreatedAt,
|
||||||
|
StartedAt: pt.StartedAt,
|
||||||
|
DoneAt: pt.DoneAt,
|
||||||
|
ErrMsg: pt.ErrMsg,
|
||||||
|
LogPath: pt.LogPath,
|
||||||
|
ArtifactsDir: pt.ArtifactsDir,
|
||||||
|
ReportJSONPath: pt.ReportJSONPath,
|
||||||
|
ReportHTMLPath: pt.ReportHTMLPath,
|
||||||
|
params: pt.Params,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(t)
|
||||||
|
return t, nil
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("task %s not found", taskID)
|
||||||
|
}
|
||||||
|
|
||||||
|
func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
|
||||||
|
if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
|
||||||
|
fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("resolve runtime for task-run", "err", err)
|
||||||
|
}
|
||||||
|
opts := &HandlerOptions{
|
||||||
|
ExportDir: exportDir,
|
||||||
|
App: app.New(platform.New()),
|
||||||
|
RuntimeMode: runtimeInfo.Mode,
|
||||||
|
}
|
||||||
|
statePath := filepath.Join(exportDir, "tasks-state.json")
|
||||||
|
task, err := loadPersistedTask(statePath, taskID)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintln(stderr, err.Error())
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
if task.StartedAt == nil || task.StartedAt.IsZero() {
|
||||||
|
now := time.Now()
|
||||||
|
task.StartedAt = &now
|
||||||
|
}
|
||||||
|
if task.Status == "" {
|
||||||
|
task.Status = TaskRunning
|
||||||
|
}
|
||||||
|
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||||
|
PID: os.Getpid(),
|
||||||
|
Status: TaskRunning,
|
||||||
|
UpdatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
fmt.Fprintln(stderr, err.Error())
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||||
|
executeTaskWithOptions(opts, task, j, ctx)
|
||||||
|
finalizeTaskForResult(task, j.err, ctx.Err() != nil)
|
||||||
|
if err := writeTaskReportArtifacts(task); err != nil {
|
||||||
|
appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||||
|
}
|
||||||
|
j.closeLog()
|
||||||
|
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||||
|
PID: os.Getpid(),
|
||||||
|
Status: task.Status,
|
||||||
|
Error: task.ErrMsg,
|
||||||
|
UpdatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
fmt.Fprintln(stderr, err.Error())
|
||||||
|
}
|
||||||
|
if task.ErrMsg != "" {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
@@ -13,6 +14,7 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -55,6 +57,7 @@ var taskNames = map[string]string{
|
|||||||
"support-bundle": "Support Bundle",
|
"support-bundle": "Support Bundle",
|
||||||
"install": "Install to Disk",
|
"install": "Install to Disk",
|
||||||
"install-to-ram": "Install to RAM",
|
"install-to-ram": "Install to RAM",
|
||||||
|
"nvme-format": "NVMe Block Format Change",
|
||||||
}
|
}
|
||||||
|
|
||||||
// burnNames maps target → human-readable name when a burn profile is set.
|
// burnNames maps target → human-readable name when a burn profile is set.
|
||||||
@@ -110,8 +113,9 @@ type Task struct {
|
|||||||
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
||||||
|
|
||||||
// runtime fields (not serialised)
|
// runtime fields (not serialised)
|
||||||
job *jobState
|
job *jobState
|
||||||
params taskParams
|
runnerPID int
|
||||||
|
params taskParams
|
||||||
}
|
}
|
||||||
|
|
||||||
// taskParams holds optional parameters parsed from the run request.
|
// taskParams holds optional parameters parsed from the run request.
|
||||||
@@ -134,6 +138,7 @@ type taskParams struct {
|
|||||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
|
LBAF int `json:"lbaf,omitempty"`
|
||||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -328,6 +333,13 @@ var (
|
|||||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||||
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
||||||
}
|
}
|
||||||
|
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||||
|
exe, err := os.Executable()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return exec.Command(exe, "bee-worker", "--export-dir", exportDir, "--task-id", taskID), nil
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// enqueue adds a task to the queue and notifies the worker.
|
// enqueue adds a task to the queue and notifies the worker.
|
||||||
@@ -365,6 +377,11 @@ func (q *taskQueue) prune() {
|
|||||||
|
|
||||||
// nextPending returns the highest-priority pending task (nil if none).
|
// nextPending returns the highest-priority pending task (nil if none).
|
||||||
func (q *taskQueue) nextPending() *Task {
|
func (q *taskQueue) nextPending() *Task {
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
if t.Status == TaskRunning {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
var best *Task
|
var best *Task
|
||||||
for _, t := range q.tasks {
|
for _, t := range q.tasks {
|
||||||
if t.Status != TaskPending {
|
if t.Status != TaskPending {
|
||||||
@@ -484,6 +501,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
|||||||
if !q.started {
|
if !q.started {
|
||||||
q.loadLocked()
|
q.loadLocked()
|
||||||
q.started = true
|
q.started = true
|
||||||
|
q.resumeRunningTasksLocked()
|
||||||
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
||||||
}
|
}
|
||||||
hasPending := q.nextPending() != nil
|
hasPending := q.nextPending() != nil
|
||||||
@@ -517,15 +535,12 @@ func (q *taskQueue) worker() {
|
|||||||
t.StartedAt = &now
|
t.StartedAt = &now
|
||||||
t.DoneAt = nil
|
t.DoneAt = nil
|
||||||
t.ErrMsg = ""
|
t.ErrMsg = ""
|
||||||
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
j := newTaskJobState(t.LogPath)
|
||||||
t.job = j
|
t.job = j
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
|
|
||||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
q.runTaskExternal(t, j)
|
||||||
j.cancel = taskCancel
|
|
||||||
q.executeTask(t, j, taskCtx)
|
|
||||||
taskCancel()
|
|
||||||
|
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
q.prune()
|
q.prune()
|
||||||
@@ -537,6 +552,218 @@ func (q *taskQueue) worker() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) resumeRunningTasksLocked() {
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
if t.Status != TaskRunning {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if t.job == nil {
|
||||||
|
t.job = newTaskJobState(t.LogPath)
|
||||||
|
}
|
||||||
|
q.attachExternalTaskControlsLocked(t, t.job)
|
||||||
|
q.startRecoveredTaskMonitorLocked(t, t.job)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) attachExternalTaskControlsLocked(t *Task, j *jobState) {
|
||||||
|
if t == nil || j == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
j.cancel = func() {
|
||||||
|
pid := t.runnerPID
|
||||||
|
if pid <= 0 {
|
||||||
|
if state, ok := readTaskRunnerState(t); ok {
|
||||||
|
pid = state.PID
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if pid > 0 {
|
||||||
|
_ = syscall.Kill(pid, syscall.SIGTERM)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
|
||||||
|
if t == nil || j == nil || t.runnerPID <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
goRecoverOnce("task runner monitor", func() {
|
||||||
|
stopTail := make(chan struct{})
|
||||||
|
doneTail := make(chan struct{})
|
||||||
|
go q.followTaskLog(t, j, stopTail, doneTail)
|
||||||
|
for processAlive(t.runnerPID) {
|
||||||
|
time.Sleep(500 * time.Millisecond)
|
||||||
|
}
|
||||||
|
close(stopTail)
|
||||||
|
<-doneTail
|
||||||
|
q.finishExternalTask(t, j, nil)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
|
||||||
|
startedKmsgWatch := false
|
||||||
|
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||||
|
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||||
|
startedKmsgWatch = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if startedKmsgWatch && q.kmsgWatcher != nil {
|
||||||
|
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
stopTail := make(chan struct{})
|
||||||
|
doneTail := make(chan struct{})
|
||||||
|
defer func() {
|
||||||
|
close(stopTail)
|
||||||
|
<-doneTail
|
||||||
|
}()
|
||||||
|
go q.followTaskLog(t, j, stopTail, doneTail)
|
||||||
|
|
||||||
|
cmd, err := externalTaskRunnerCommand(q.opts.ExportDir, t.ID)
|
||||||
|
if err != nil {
|
||||||
|
j.appendFromLog("ERROR: " + err.Error())
|
||||||
|
q.finishExternalTask(t, j, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
j.appendFromLog("ERROR: " + err.Error())
|
||||||
|
q.finishExternalTask(t, j, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
q.mu.Lock()
|
||||||
|
t.runnerPID = cmd.Process.Pid
|
||||||
|
q.attachExternalTaskControlsLocked(t, j)
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
|
||||||
|
waitErr := cmd.Wait()
|
||||||
|
time.Sleep(200 * time.Millisecond)
|
||||||
|
q.finishExternalTask(t, j, waitErr)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) followTaskLog(t *Task, j *jobState, stop <-chan struct{}, done chan<- struct{}) {
|
||||||
|
defer close(done)
|
||||||
|
path := ""
|
||||||
|
if t != nil {
|
||||||
|
path = t.LogPath
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
offset := int64(0)
|
||||||
|
if info, err := os.Stat(path); err == nil {
|
||||||
|
offset = info.Size()
|
||||||
|
}
|
||||||
|
var partial string
|
||||||
|
ticker := time.NewTicker(250 * time.Millisecond)
|
||||||
|
defer ticker.Stop()
|
||||||
|
flush := func() {
|
||||||
|
data, newOffset, err := readTaskLogDelta(path, offset)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
offset = newOffset
|
||||||
|
return
|
||||||
|
}
|
||||||
|
offset = newOffset
|
||||||
|
text := partial + strings.ReplaceAll(string(data), "\r\n", "\n")
|
||||||
|
lines := strings.Split(text, "\n")
|
||||||
|
partial = lines[len(lines)-1]
|
||||||
|
for _, line := range lines[:len(lines)-1] {
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
j.appendFromLog(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
flush()
|
||||||
|
case <-stop:
|
||||||
|
flush()
|
||||||
|
if strings.TrimSpace(partial) != "" {
|
||||||
|
j.appendFromLog(partial)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func readTaskLogDelta(path string, offset int64) ([]byte, int64, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, offset, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
info, err := f.Stat()
|
||||||
|
if err != nil {
|
||||||
|
return nil, offset, err
|
||||||
|
}
|
||||||
|
if info.Size() < offset {
|
||||||
|
offset = 0
|
||||||
|
}
|
||||||
|
if _, err := f.Seek(offset, io.SeekStart); err != nil {
|
||||||
|
return nil, offset, err
|
||||||
|
}
|
||||||
|
data, err := io.ReadAll(io.LimitReader(f, 1<<20))
|
||||||
|
return data, offset + int64(len(data)), err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) finishExternalTask(t *Task, j *jobState, waitErr error) {
|
||||||
|
q.mu.Lock()
|
||||||
|
defer q.mu.Unlock()
|
||||||
|
if t.Status == TaskDone || t.Status == TaskFailed || t.Status == TaskCancelled {
|
||||||
|
if j != nil && !j.isDone() {
|
||||||
|
j.finish(t.ErrMsg)
|
||||||
|
j.closeLog()
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case q.trigger <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
state, ok := readTaskRunnerState(t)
|
||||||
|
switch {
|
||||||
|
case ok && state.Status != TaskRunning:
|
||||||
|
t.Status = state.Status
|
||||||
|
t.ErrMsg = state.Error
|
||||||
|
now := state.UpdatedAt
|
||||||
|
if now.IsZero() {
|
||||||
|
now = time.Now()
|
||||||
|
}
|
||||||
|
t.DoneAt = &now
|
||||||
|
case waitErr != nil:
|
||||||
|
now := time.Now()
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = waitErr.Error()
|
||||||
|
t.DoneAt = &now
|
||||||
|
default:
|
||||||
|
now := time.Now()
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = "task runner exited without final state"
|
||||||
|
t.DoneAt = &now
|
||||||
|
}
|
||||||
|
t.runnerPID = 0
|
||||||
|
q.finalizeTaskArtifactPathsLocked(t)
|
||||||
|
q.persistLocked()
|
||||||
|
|
||||||
|
if j != nil && !j.isDone() {
|
||||||
|
j.finish(t.ErrMsg)
|
||||||
|
j.closeLog()
|
||||||
|
}
|
||||||
|
if t.ErrMsg != "" {
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||||
|
} else {
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case q.trigger <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
|
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
|
||||||
startedKmsgWatch := false
|
startedKmsgWatch := false
|
||||||
defer q.finalizeTaskRun(t, j)
|
defer q.finalizeTaskRun(t, j)
|
||||||
@@ -985,15 +1212,11 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
|||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job == nil || !t.job.abort() {
|
||||||
t.job.abort()
|
writeError(w, http.StatusConflict, "task is not cancellable")
|
||||||
|
return
|
||||||
}
|
}
|
||||||
t.Status = TaskCancelled
|
writeJSON(w, map[string]string{"status": "aborting"})
|
||||||
now := time.Now()
|
|
||||||
t.DoneAt = &now
|
|
||||||
globalQueue.persistLocked()
|
|
||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
|
||||||
default:
|
default:
|
||||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||||
}
|
}
|
||||||
@@ -1039,12 +1262,6 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
|||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
t.job.abort()
|
t.job.abort()
|
||||||
}
|
}
|
||||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
|
||||||
platform.KillTestWorkers()
|
|
||||||
}
|
|
||||||
t.Status = TaskCancelled
|
|
||||||
t.DoneAt = &now
|
|
||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
|
||||||
n++
|
n++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1175,18 +1392,29 @@ func (q *taskQueue) loadLocked() {
|
|||||||
}
|
}
|
||||||
q.assignTaskLogPathLocked(t)
|
q.assignTaskLogPathLocked(t)
|
||||||
if t.Status == TaskRunning {
|
if t.Status == TaskRunning {
|
||||||
// The task was interrupted by a bee-web restart. Child processes
|
state, ok := readTaskRunnerState(t)
|
||||||
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
|
switch {
|
||||||
// their own process groups. Kill any matching stale workers before
|
case ok && state.Status == TaskRunning && processAlive(state.PID):
|
||||||
// marking the task failed so the next GPU test does not inherit a
|
t.runnerPID = state.PID
|
||||||
// busy DCGM slot or duplicate workers.
|
t.job = newTaskJobState(t.LogPath)
|
||||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
case ok && state.Status != TaskRunning:
|
||||||
_ = platform.KillTestWorkers()
|
t.runnerPID = state.PID
|
||||||
|
t.Status = state.Status
|
||||||
|
t.ErrMsg = state.Error
|
||||||
|
now := state.UpdatedAt
|
||||||
|
if now.IsZero() {
|
||||||
|
now = time.Now()
|
||||||
|
}
|
||||||
|
t.DoneAt = &now
|
||||||
|
default:
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
_ = platform.KillTestWorkers()
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.DoneAt = &now
|
||||||
|
t.ErrMsg = "interrupted by bee-web restart"
|
||||||
}
|
}
|
||||||
now := time.Now()
|
|
||||||
t.Status = TaskFailed
|
|
||||||
t.DoneAt = &now
|
|
||||||
t.ErrMsg = "interrupted by bee-web restart"
|
|
||||||
} else if t.Status == TaskPending {
|
} else if t.Status == TaskPending {
|
||||||
t.StartedAt = nil
|
t.StartedAt = nil
|
||||||
t.DoneAt = nil
|
t.DoneAt = nil
|
||||||
|
|||||||
@@ -126,6 +126,23 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestJobAppendFlushesTaskLogImmediately(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "task.log")
|
||||||
|
j := newTaskJobState(path)
|
||||||
|
|
||||||
|
j.append("live-line")
|
||||||
|
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if string(data) != "live-line\n" {
|
||||||
|
t.Fatalf("log=%q want live-line newline", string(data))
|
||||||
|
}
|
||||||
|
j.closeLog()
|
||||||
|
}
|
||||||
|
|
||||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||||
q := &taskQueue{
|
q := &taskQueue{
|
||||||
@@ -849,3 +866,82 @@ func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
|||||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunTaskExternalOpensAndClosesKmsgWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
releasePath := filepath.Join(dir, "release")
|
||||||
|
readyPath := filepath.Join(dir, "ready")
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{ExportDir: dir},
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
kmsgWatcher: newKmsgWatcher(nil),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-external-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
q.assignTaskLogPathLocked(tk)
|
||||||
|
j := newTaskJobState(tk.LogPath)
|
||||||
|
|
||||||
|
orig := externalTaskRunnerCommand
|
||||||
|
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||||
|
script := "printf ready > \"$1\"; while [ ! -f \"$2\" ]; do sleep 0.05; done"
|
||||||
|
return exec.Command("sh", "-c", script, "sh", readyPath, releasePath), nil
|
||||||
|
}
|
||||||
|
defer func() { externalTaskRunnerCommand = orig }()
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
q.runTaskExternal(tk, j)
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
deadline := time.Now().Add(2 * time.Second)
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
if _, err := os.Stat(readyPath); err == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
time.Sleep(20 * time.Millisecond)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(readyPath); err != nil {
|
||||||
|
t.Fatalf("external runner did not start: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
q.kmsgWatcher.mu.Lock()
|
||||||
|
activeCount := q.kmsgWatcher.activeCount
|
||||||
|
window := q.kmsgWatcher.window
|
||||||
|
q.kmsgWatcher.mu.Unlock()
|
||||||
|
if activeCount != 1 {
|
||||||
|
t.Fatalf("activeCount while running=%d want 1", activeCount)
|
||||||
|
}
|
||||||
|
if window == nil || len(window.targets) != 1 || window.targets[0] != "cpu" {
|
||||||
|
t.Fatalf("window while running=%+v", window)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile(releasePath, []byte("1\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("runTaskExternal did not return")
|
||||||
|
}
|
||||||
|
|
||||||
|
q.kmsgWatcher.mu.Lock()
|
||||||
|
activeCount = q.kmsgWatcher.activeCount
|
||||||
|
window = q.kmsgWatcher.window
|
||||||
|
q.kmsgWatcher.mu.Unlock()
|
||||||
|
if activeCount != 0 {
|
||||||
|
t.Fatalf("activeCount after finish=%d want 0", activeCount)
|
||||||
|
}
|
||||||
|
if window != nil {
|
||||||
|
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
62
audit/internal/webui/viewer_snapshot.go
Normal file
62
audit/internal/webui/viewer_snapshot.go
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strconv"
|
||||||
|
)
|
||||||
|
|
||||||
|
func enrichSnapshotForViewer(snapshot []byte) []byte {
|
||||||
|
if len(snapshot) == 0 {
|
||||||
|
return snapshot
|
||||||
|
}
|
||||||
|
var root map[string]any
|
||||||
|
if err := json.Unmarshal(snapshot, &root); err != nil {
|
||||||
|
return snapshot
|
||||||
|
}
|
||||||
|
hardware, _ := root["hardware"].(map[string]any)
|
||||||
|
if len(hardware) == 0 {
|
||||||
|
return snapshot
|
||||||
|
}
|
||||||
|
storage, _ := hardware["storage"].([]any)
|
||||||
|
if len(storage) == 0 {
|
||||||
|
return snapshot
|
||||||
|
}
|
||||||
|
changed := false
|
||||||
|
for _, item := range storage {
|
||||||
|
row, _ := item.(map[string]any)
|
||||||
|
if len(row) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, exists := row["block_format"]; exists {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
logical, okLogical := jsonNumberToInt64(row["logical_block_size_bytes"])
|
||||||
|
metadata, okMetadata := jsonNumberToInt64(row["metadata_bytes_per_block"])
|
||||||
|
if !okLogical || !okMetadata || logical <= 0 || metadata < 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
row["block_format"] = strconv.FormatInt(logical, 10) + "+" + strconv.FormatInt(metadata, 10)
|
||||||
|
changed = true
|
||||||
|
}
|
||||||
|
if !changed {
|
||||||
|
return snapshot
|
||||||
|
}
|
||||||
|
out, err := json.Marshal(root)
|
||||||
|
if err != nil {
|
||||||
|
return snapshot
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func jsonNumberToInt64(v any) (int64, bool) {
|
||||||
|
switch x := v.(type) {
|
||||||
|
case float64:
|
||||||
|
return int64(x), true
|
||||||
|
case int64:
|
||||||
|
return x, true
|
||||||
|
case int:
|
||||||
|
return int64(x), true
|
||||||
|
default:
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
}
|
||||||
2
bible
2
bible
Submodule bible updated: 1d89a4918e...d2600f1279
@@ -9,5 +9,62 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
|||||||
|---|---|
|
|---|---|
|
||||||
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
||||||
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
||||||
|
| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list |
|
||||||
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||||
| `decisions/` | Architectural decision log |
|
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
|
||||||
|
| `decisions/` | Architectural decision log, including read-only submodule policy |
|
||||||
|
|
||||||
|
## Validate Test Matrix
|
||||||
|
|
||||||
|
### Validate
|
||||||
|
|
||||||
|
- CPU check
|
||||||
|
- `lscpu`
|
||||||
|
- `sensors`
|
||||||
|
- `stress-ng`
|
||||||
|
- Memory check
|
||||||
|
- `free`
|
||||||
|
- `timeout <timeout_sec> memtester`
|
||||||
|
- `free`
|
||||||
|
- NVMe storage check
|
||||||
|
- `nvme id-ctrl`
|
||||||
|
- `nvme smart-log`
|
||||||
|
- `nvme device-self-test`
|
||||||
|
- SATA/SAS storage check
|
||||||
|
- `smartctl -H -A`
|
||||||
|
- `smartctl -t short`
|
||||||
|
- Basic NVIDIA GPU check
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dmidecode -t baseboard`
|
||||||
|
- `dmidecode -t system`
|
||||||
|
- `dcgmi diag -r 2`
|
||||||
|
- Inter-GPU communication check
|
||||||
|
- `all_reduce_perf`
|
||||||
|
- GPU bandwidth check
|
||||||
|
- `dcgmi diag -r nvbandwidth`
|
||||||
|
|
||||||
|
### Validate -> Stress
|
||||||
|
|
||||||
|
- Extended NVIDIA GPU check
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dmidecode -t baseboard`
|
||||||
|
- `dmidecode -t system`
|
||||||
|
- `dcgmi diag -r 3`
|
||||||
|
- NVIDIA targeted stress
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dcgmi diag -r targeted_stress`
|
||||||
|
- NVIDIA targeted power
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dcgmi diag -r targeted_power`
|
||||||
|
- NVIDIA pulse test
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dcgmi diag -r pulse_test`
|
||||||
|
- Inter-GPU communication check
|
||||||
|
- `all_reduce_perf`
|
||||||
|
- GPU bandwidth check
|
||||||
|
- `dcgmi diag -r nvbandwidth`
|
||||||
|
|||||||
@@ -149,7 +149,6 @@ Current validation state:
|
|||||||
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
|
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
|
||||||
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
|
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
|
||||||
8. output JSON → /var/log/bee-audit.json
|
8. output JSON → /var/log/bee-audit.json
|
||||||
9. QR summary to stdout (qrencode if available)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||||
|
|||||||
@@ -58,6 +58,8 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
- `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
|
- `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
|
||||||
- Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
|
- Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
|
||||||
- Contract fields that have no honest local source on a generic Linux host may remain empty.
|
- Contract fields that have no honest local source on a generic Linux host may remain empty.
|
||||||
|
- Embedded submodules such as `internal/chart/` and `bible/` are read-only for `bee` feature work.
|
||||||
|
- If the UI needs extra information, `bee` must emit it through the standard audit JSON contract rather than patching `chart`.
|
||||||
|
|
||||||
## Tech stack
|
## Tech stack
|
||||||
|
|
||||||
@@ -101,7 +103,7 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
| `iso/builder/` | ISO build scripts and `live-build` profile |
|
| `iso/builder/` | ISO build scripts and `live-build` profile |
|
||||||
| `iso/overlay/` | Source overlay copied into a staged build overlay |
|
| `iso/overlay/` | Source overlay copied into a staged build overlay |
|
||||||
| `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
|
| `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
|
||||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
|
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web`; update by submodule pointer only, never by local `bee`-specific edits |
|
||||||
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
||||||
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
||||||
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
|
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
|
||||||
|
|||||||
@@ -0,0 +1,39 @@
|
|||||||
|
# Decision: Treat embedded submodules as read-only
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
`bee` embeds external git submodules such as:
|
||||||
|
|
||||||
|
- `internal/chart/` — `reanimator/chart`, a generic read-only viewer for Reanimator JSON snapshots
|
||||||
|
- `bible/` — shared engineering rules and contracts
|
||||||
|
|
||||||
|
These repositories are reused by other projects. A local feature request in `bee`
|
||||||
|
must not be solved by silently changing shared submodule behavior.
|
||||||
|
|
||||||
|
The concrete failure mode here was attempting to add project-specific storage
|
||||||
|
telemetry presentation by editing `internal/chart/`. That couples a shared viewer
|
||||||
|
to one host application's needs and creates hidden cross-project regressions.
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
Embedded submodules are read-only from the point of view of `bee`.
|
||||||
|
|
||||||
|
- Do not implement `bee`-specific behavior by editing `internal/chart/`.
|
||||||
|
- Do not implement `bee`-specific behavior by editing `bible/`.
|
||||||
|
- If `bee` needs new data in the report, produce it in the standard audit JSON
|
||||||
|
emitted by `bee` itself.
|
||||||
|
- `chart` must continue to consume the canonical snapshot as an external viewer,
|
||||||
|
without host-specific forks.
|
||||||
|
- Updating a submodule pointer to an upstream commit is allowed.
|
||||||
|
- Carrying local unmerged submodule commits as part of a `bee` feature is forbidden.
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Audit/report features must be expressed through the contract in
|
||||||
|
`bible-local/docs/hardware-ingest-contract.md`.
|
||||||
|
- `bee` owns collection, normalization, and serialization of storage telemetry in
|
||||||
|
`hardware.storage[]`.
|
||||||
|
- `chart` remains a pure visualization module that reads the snapshot it is given.
|
||||||
|
- If a capability is genuinely missing in a shared submodule, it must be proposed
|
||||||
|
and landed upstream as a generic change first, then pulled into `bee` via a
|
||||||
|
normal submodule update.
|
||||||
@@ -6,3 +6,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
|||||||
|---|---|---|
|
|---|---|---|
|
||||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||||
|
| 2026-04-29 | Treat embedded submodules as read-only | active |
|
||||||
|
|||||||
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# GPU PCIe Test Methodology
|
||||||
|
|
||||||
|
## Validate
|
||||||
|
|
||||||
|
- CPU check
|
||||||
|
- `lscpu`
|
||||||
|
- `sensors`
|
||||||
|
- `stress-ng`
|
||||||
|
- Memory check
|
||||||
|
- `free`
|
||||||
|
- `timeout <timeout_sec> memtester`
|
||||||
|
- `free`
|
||||||
|
- NVMe storage check
|
||||||
|
- `nvme id-ctrl`
|
||||||
|
- `nvme smart-log`
|
||||||
|
- `nvme device-self-test`
|
||||||
|
- SATA/SAS storage check
|
||||||
|
- `smartctl -H -A`
|
||||||
|
- `smartctl -t short`
|
||||||
|
- Basic NVIDIA GPU check
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dmidecode -t baseboard`
|
||||||
|
- `dmidecode -t system`
|
||||||
|
- `dcgmi diag -r 2`
|
||||||
|
- Inter-GPU communication check
|
||||||
|
- `all_reduce_perf`
|
||||||
|
- GPU bandwidth check
|
||||||
|
- `dcgmi diag -r nvbandwidth`
|
||||||
|
|
||||||
|
## Validate -> Stress
|
||||||
|
|
||||||
|
- Extended NVIDIA GPU check
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dmidecode -t baseboard`
|
||||||
|
- `dmidecode -t system`
|
||||||
|
- `dcgmi diag -r 3`
|
||||||
|
- NVIDIA targeted stress
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dcgmi diag -r targeted_stress`
|
||||||
|
- NVIDIA targeted power
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dcgmi diag -r targeted_power`
|
||||||
|
- NVIDIA pulse test
|
||||||
|
- `nvidia-smi -pm 1`
|
||||||
|
- `nvidia-smi -q`
|
||||||
|
- `dcgmi diag -r pulse_test`
|
||||||
|
- Inter-GPU communication check
|
||||||
|
- `all_reduce_perf`
|
||||||
|
- GPU bandwidth check
|
||||||
|
- `dcgmi diag -r nvbandwidth`
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
---
|
---
|
||||||
title: Hardware Ingest JSON Contract
|
title: Hardware Ingest JSON Contract
|
||||||
version: "2.7"
|
version: "2.10"
|
||||||
updated: "2026-03-15"
|
updated: "2026-04-29"
|
||||||
maintainer: Reanimator Core
|
maintainer: Reanimator Core
|
||||||
audience: external-integrators, ai-agents
|
audience: external-integrators, ai-agents
|
||||||
language: ru
|
language: ru
|
||||||
@@ -9,7 +9,7 @@ language: ru
|
|||||||
|
|
||||||
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
||||||
|
|
||||||
Версия: **2.7** · Дата: **2026-03-15**
|
Версия: **2.10** · Дата: **2026-04-29**
|
||||||
|
|
||||||
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
||||||
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
||||||
@@ -22,6 +22,9 @@ language: ru
|
|||||||
|
|
||||||
| Версия | Дата | Изменения |
|
| Версия | Дата | Изменения |
|
||||||
|--------|------|-----------|
|
|--------|------|-----------|
|
||||||
|
| 2.10 | 2026-04-29 | Для `hardware.storage[]` добавлены необязательные числовые поля `logical_block_size_bytes`, `physical_block_size_bytes`, `metadata_bytes_per_block` для нормализованного описания формата блока накопителя |
|
||||||
|
| 2.9 | 2026-03-19 | Добавлена необязательная секция `hardware.platform_config` — произвольный объект с настройками платформы (BIOS/Redfish); хранится как latest-snapshot per machine |
|
||||||
|
| 2.8 | 2026-03-15 | Поле `location` удалено из всех `sensors.*`; сенсоры передаются только по `name` и измеренным значениям |
|
||||||
| 2.7 | 2026-03-15 | Явно запрещён синтез данных в `event_logs`; интеграторы не должны придумывать серийные номера компонентов, если источник их не отдал |
|
| 2.7 | 2026-03-15 | Явно запрещён синтез данных в `event_logs`; интеграторы не должны придумывать серийные номера компонентов, если источник их не отдал |
|
||||||
| 2.6 | 2026-03-15 | Добавлена необязательная секция `event_logs` для dedup/upsert логов `host` / `bmc` / `redfish` вне history timeline |
|
| 2.6 | 2026-03-15 | Добавлена необязательная секция `event_logs` для dedup/upsert логов `host` / `bmc` / `redfish` вне history timeline |
|
||||||
| 2.5 | 2026-03-15 | Добавлено общее необязательное поле `manufactured_year_week` для компонентных секций (`YYYY-Www`) |
|
| 2.5 | 2026-03-15 | Добавлено общее необязательное поле `manufactured_year_week` для компонентных секций (`YYYY-Www`) |
|
||||||
@@ -131,8 +134,9 @@ GET /ingest/hardware/jobs/{job_id}
|
|||||||
"storage": [ ... ],
|
"storage": [ ... ],
|
||||||
"pcie_devices": [ ... ],
|
"pcie_devices": [ ... ],
|
||||||
"power_supplies": [ ... ],
|
"power_supplies": [ ... ],
|
||||||
"sensors": { ... },
|
"sensors": { ... },
|
||||||
"event_logs": [ ... ]
|
"event_logs": [ ... ],
|
||||||
|
"platform_config": { ... }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
@@ -343,6 +347,9 @@ GET /ingest/hardware/jobs/{job_id}
|
|||||||
| `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` |
|
| `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` |
|
||||||
| `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` |
|
| `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` |
|
||||||
| `size_gb` | int | нет | Размер в ГБ |
|
| `size_gb` | int | нет | Размер в ГБ |
|
||||||
|
| `logical_block_size_bytes` | int64 | нет | Логический размер пользовательского блока данных, например `512` или `4096` |
|
||||||
|
| `physical_block_size_bytes` | int64 | нет | Физический размер блока, если известен, например `4096` |
|
||||||
|
| `metadata_bytes_per_block` | int64 | нет | Metadata / protection bytes на логический блок, например `0` или `8` |
|
||||||
| `temperature_c` | float | нет | Температура накопителя, °C (telemetry) |
|
| `temperature_c` | float | нет | Температура накопителя, °C (telemetry) |
|
||||||
| `power_on_hours` | int64 | нет | Время работы, часы |
|
| `power_on_hours` | int64 | нет | Время работы, часы |
|
||||||
| `power_cycles` | int64 | нет | Количество циклов питания |
|
| `power_cycles` | int64 | нет | Количество циклов питания |
|
||||||
@@ -363,6 +370,11 @@ GET /ingest/hardware/jobs/{job_id}
|
|||||||
|
|
||||||
Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`.
|
Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`.
|
||||||
|
|
||||||
|
Формат вида `512+8` в контракт не добавляется отдельным строковым полем. Если источник знает такую форму, он должен передавать её как:
|
||||||
|
- `logical_block_size_bytes = 512`
|
||||||
|
- `metadata_bytes_per_block = 8`
|
||||||
|
- `physical_block_size_bytes = 512` или `4096`, если известен физический размер блока
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"storage": [
|
"storage": [
|
||||||
{
|
{
|
||||||
@@ -370,6 +382,9 @@ GET /ingest/hardware/jobs/{job_id}
|
|||||||
"type": "NVMe",
|
"type": "NVMe",
|
||||||
"model": "INTEL SSDPF2KX076T1",
|
"model": "INTEL SSDPF2KX076T1",
|
||||||
"size_gb": 7680,
|
"size_gb": 7680,
|
||||||
|
"logical_block_size_bytes": 512,
|
||||||
|
"physical_block_size_bytes": 4096,
|
||||||
|
"metadata_bytes_per_block": 8,
|
||||||
"temperature_c": 38.5,
|
"temperature_c": 38.5,
|
||||||
"power_on_hours": 12450,
|
"power_on_hours": 12450,
|
||||||
"unsafe_shutdowns": 3,
|
"unsafe_shutdowns": 3,
|
||||||
@@ -592,7 +607,6 @@ PSU без `serial_number` игнорируется.
|
|||||||
| Поле | Тип | Обязательно | Описание |
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|------|-----|-------------|----------|
|
|------|-----|-------------|----------|
|
||||||
| `name` | string | **да** | Уникальное имя сенсора в рамках секции |
|
| `name` | string | **да** | Уникальное имя сенсора в рамках секции |
|
||||||
| `location` | string | нет | Физическое расположение |
|
|
||||||
| `rpm` | int | нет | Обороты, RPM |
|
| `rpm` | int | нет | Обороты, RPM |
|
||||||
| `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` |
|
| `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` |
|
||||||
|
|
||||||
@@ -601,7 +615,6 @@ PSU без `serial_number` игнорируется.
|
|||||||
| Поле | Тип | Обязательно | Описание |
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|------|-----|-------------|----------|
|
|------|-----|-------------|----------|
|
||||||
| `name` | string | **да** | Уникальное имя сенсора |
|
| `name` | string | **да** | Уникальное имя сенсора |
|
||||||
| `location` | string | нет | Физическое расположение |
|
|
||||||
| `voltage_v` | float | нет | Напряжение, В |
|
| `voltage_v` | float | нет | Напряжение, В |
|
||||||
| `current_a` | float | нет | Ток, А |
|
| `current_a` | float | нет | Ток, А |
|
||||||
| `power_w` | float | нет | Мощность, Вт |
|
| `power_w` | float | нет | Мощность, Вт |
|
||||||
@@ -612,7 +625,6 @@ PSU без `serial_number` игнорируется.
|
|||||||
| Поле | Тип | Обязательно | Описание |
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|------|-----|-------------|----------|
|
|------|-----|-------------|----------|
|
||||||
| `name` | string | **да** | Уникальное имя сенсора |
|
| `name` | string | **да** | Уникальное имя сенсора |
|
||||||
| `location` | string | нет | Физическое расположение |
|
|
||||||
| `celsius` | float | нет | Температура, °C |
|
| `celsius` | float | нет | Температура, °C |
|
||||||
| `threshold_warning_celsius` | float | нет | Порог Warning, °C |
|
| `threshold_warning_celsius` | float | нет | Порог Warning, °C |
|
||||||
| `threshold_critical_celsius` | float | нет | Порог Critical, °C |
|
| `threshold_critical_celsius` | float | нет | Порог Critical, °C |
|
||||||
@@ -623,29 +635,29 @@ PSU без `serial_number` игнорируется.
|
|||||||
| Поле | Тип | Обязательно | Описание |
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|------|-----|-------------|----------|
|
|------|-----|-------------|----------|
|
||||||
| `name` | string | **да** | Уникальное имя сенсора |
|
| `name` | string | **да** | Уникальное имя сенсора |
|
||||||
| `location` | string | нет | Физическое расположение |
|
|
||||||
| `value` | float | нет | Значение |
|
| `value` | float | нет | Значение |
|
||||||
| `unit` | string | нет | Единица измерения |
|
| `unit` | string | нет | Единица измерения |
|
||||||
| `status` | string | нет | Статус |
|
| `status` | string | нет | Статус |
|
||||||
|
|
||||||
**Правила sensors:**
|
**Правила sensors:**
|
||||||
- Идентификатор сенсора: пара `(sensor_type, name)`. Дубли в одном payload — берётся первое вхождение.
|
- Идентификатор сенсора: пара `(sensor_type, name)`. Дубли в одном payload — берётся первое вхождение.
|
||||||
|
- `location` для сенсоров передавать не нужно и не следует: в Reanimator location/slot используется только для проверки перемещения и установки компонентов, а не для last-known-value sensor ingest.
|
||||||
- Сенсоры без `name` игнорируются.
|
- Сенсоры без `name` игнорируются.
|
||||||
- При каждом импорте значения перезаписываются (upsert по ключу).
|
- При каждом импорте значения перезаписываются (upsert по ключу).
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"sensors": {
|
"sensors": {
|
||||||
"fans": [
|
"fans": [
|
||||||
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" },
|
{ "name": "FAN1", "rpm": 4200, "status": "OK" },
|
||||||
{ "name": "FAN_CPU0", "location": "CPU0", "rpm": 5600, "status": "OK" }
|
{ "name": "FAN_CPU0", "rpm": 5600, "status": "OK" }
|
||||||
],
|
],
|
||||||
"power": [
|
"power": [
|
||||||
{ "name": "12V Rail", "location": "Mainboard", "voltage_v": 12.06, "status": "OK" },
|
{ "name": "12V Rail", "voltage_v": 12.06, "status": "OK" },
|
||||||
{ "name": "PSU0 Input", "location": "PSU0", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
|
{ "name": "PSU0 Input", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
|
||||||
],
|
],
|
||||||
"temperatures": [
|
"temperatures": [
|
||||||
{ "name": "CPU0 Temp", "location": "CPU0", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
|
{ "name": "CPU0 Temp", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
|
||||||
{ "name": "Inlet Temp", "location": "Front", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
|
{ "name": "Inlet Temp", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
|
||||||
],
|
],
|
||||||
"other": [
|
"other": [
|
||||||
{ "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" }
|
{ "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" }
|
||||||
@@ -655,6 +667,31 @@ PSU без `serial_number` игнорируется.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Секция platform_config
|
||||||
|
|
||||||
|
Необязательный объект с произвольными ключами — настройки платформы как есть из источника (BIOS, Redfish, IPMI).
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `platform_config` | object | нет | Произвольный объект: ключи — строки, значения — строки, числа, булевы |
|
||||||
|
|
||||||
|
**Правила platform_config:**
|
||||||
|
- Содержимое объекта не валидируется: передавайте параметры как есть.
|
||||||
|
- При каждом импорте хранится latest-snapshot per machine; история изменений по каждому ключу накапливается отдельно.
|
||||||
|
- Если секция отсутствует или равна `null` — данные платформы не обновляются.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"platform_config": {
|
||||||
|
"SecureBoot": "Enabled",
|
||||||
|
"BiosVersion": "06.08.05",
|
||||||
|
"TpmEnabled": true,
|
||||||
|
"NumaEnabled": false,
|
||||||
|
"HyperThreading": "Enabled"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Обработка статусов компонентов
|
## Обработка статусов компонентов
|
||||||
|
|
||||||
| Статус | Поведение |
|
| Статус | Поведение |
|
||||||
@@ -787,6 +824,12 @@ PSU без `serial_number` игнорируется.
|
|||||||
"other": [
|
"other": [
|
||||||
{ "name": "System Humidity", "value": 38.5, "unit": "%" }
|
{ "name": "System Humidity", "value": 38.5, "unit": "%" }
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"platform_config": {
|
||||||
|
"SecureBoot": "Enabled",
|
||||||
|
"BiosVersion": "06.08.05",
|
||||||
|
"TpmEnabled": true,
|
||||||
|
"HyperThreading": "Enabled"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
31
bible-local/rules/patterns/ascii-safe-text/contract.md
Normal file
31
bible-local/rules/patterns/ascii-safe-text/contract.md
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
# Contract: ASCII-Safe Text in Scripts and Boot Configs
|
||||||
|
|
||||||
|
Version: 1.0
|
||||||
|
|
||||||
|
## Principle
|
||||||
|
|
||||||
|
Shell scripts, bootloader configs, and any text rendered on serial/SOL consoles must use only printable ASCII characters. Non-ASCII Unicode — including typographic punctuation such as the em-dash (U+2014 `—`), en-dash (U+2013 `–`), curly quotes, and ellipsis (U+2026 `…`) — breaks rendering on serial terminals, GRUB text/serial mode, IPMI SOL, and tooling that assumes ASCII.
|
||||||
|
|
||||||
|
## Rules
|
||||||
|
|
||||||
|
- Never use em-dash (`—`) or en-dash (`–`) in any shell script, GRUB config, syslinux/isolinux config, or service unit file. Use ASCII double-hyphen `--` or single hyphen `-` instead.
|
||||||
|
- Never use curly quotes (`"` `"` `'` `'`) in shell scripts or configs. Use straight quotes `"` and `'`.
|
||||||
|
- Never use the Unicode ellipsis (`…`). Use `...`.
|
||||||
|
- GRUB `menuentry` and `submenu` titles must be ASCII-only — GRUB serial terminal output is ASCII; non-ASCII characters render as garbage or are dropped.
|
||||||
|
- Comments in GRUB theme files (`.txt`) must also be ASCII-only, as GRUB may parse the entire file.
|
||||||
|
|
||||||
|
## Why
|
||||||
|
|
||||||
|
GRUB renders menus over both `gfxterm` (graphical, Unicode-capable) and `serial` (ASCII-only) simultaneously when `terminal_output gfxterm serial` is set. The serial output — used by IPMI SOL and BMC remote consoles — cannot display multi-byte UTF-8 sequences and shows raw bytes or drops characters. A menuentry title `"EASY-BEE — GSP=off"` appears as `"EASY-BEE â€" GSP=off"` or `"EASY-BEE GSP=off"` on SOL, making the menu unreadable.
|
||||||
|
|
||||||
|
## Anti-patterns
|
||||||
|
|
||||||
|
- `menuentry "EASY-BEE — GSP=off"` — em-dash in GRUB title
|
||||||
|
- `# bee logo — centered` — em-dash in GRUB theme comment
|
||||||
|
- `echo "done — reboot"` in a shell script displayed over serial
|
||||||
|
|
||||||
|
## Correct form
|
||||||
|
|
||||||
|
- `menuentry "EASY-BEE -- GSP=off"`
|
||||||
|
- `# bee logo - centered`
|
||||||
|
- `echo "done - reboot"`
|
||||||
134
git-bible/grub-bitmap-error.md
Normal file
134
git-bible/grub-bitmap-error.md
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
# GRUB bitmap error: null src bitmap in grub_video_bitmap_create_scaled
|
||||||
|
|
||||||
|
## Symptom
|
||||||
|
|
||||||
|
```
|
||||||
|
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||||
|
Press any key to continue...
|
||||||
|
```
|
||||||
|
|
||||||
|
Appears on boot before the GRUB menu renders. The menu still appears after pressing a key,
|
||||||
|
but without the bee logo. Reproduced on real hardware (Lenovo SR650 V3, ASUS GPU servers).
|
||||||
|
|
||||||
|
## Root cause model
|
||||||
|
|
||||||
|
`grub_video_bitmap_create_scaled` receives a null `src` pointer, meaning the PNG loader
|
||||||
|
returned null for `bee-logo.png`. GRUB calls this function even when no explicit
|
||||||
|
`width`/`height` are set in `theme.txt` — it is invoked any time an image component is
|
||||||
|
rendered, passing the image's natural dimensions as the target size.
|
||||||
|
|
||||||
|
The PNG file is referenced as `file = "bee-logo.png"` (relative to theme dir).
|
||||||
|
GRUB resolves this to `/boot/grub/live-theme/bee-logo.png`.
|
||||||
|
|
||||||
|
## Attempts that did NOT fix the error
|
||||||
|
|
||||||
|
### Attempt 1 — add explicit `width`/`height` to image block (d52ec67)
|
||||||
|
|
||||||
|
**What was done:** First introduction of bee-logo.png with:
|
||||||
|
```
|
||||||
|
+ image {
|
||||||
|
top = 4%
|
||||||
|
left = 50%-200
|
||||||
|
width = 400
|
||||||
|
height = 400
|
||||||
|
file = "bee-logo.png"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
PNG at this point was RGBA (color_type=6).
|
||||||
|
|
||||||
|
**Result:** Error appeared immediately on first ISO build.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Attempt 2 — remove `width`/`height` from image block (aa284ae)
|
||||||
|
|
||||||
|
**Hypothesis:** Explicit scaling dimensions trigger the scale path; removing them avoids it.
|
||||||
|
|
||||||
|
**What was done:** Removed `width = 400` and `height = 400` from the image block.
|
||||||
|
```
|
||||||
|
+ image {
|
||||||
|
top = 4%
|
||||||
|
left = 50%-200
|
||||||
|
file = "bee-logo.png"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Result:** Error persists. GRUB calls `grub_video_bitmap_create_scaled` regardless of whether
|
||||||
|
`width`/`height` are specified — if the bitmap is null (loading failed), the error fires either way.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Attempt 3 — convert PNG to RGBA + strip metadata chunks (6112094)
|
||||||
|
|
||||||
|
**Hypothesis:** GRUB's minimal PNG parser is confused by metadata chunks (cHRM, bKGD, tIME, tEXt).
|
||||||
|
Also re-ordered `terminal_output gfxterm` before `insmod png` / theme load.
|
||||||
|
|
||||||
|
**What was done:**
|
||||||
|
- Converted PNG to RGBA color_type=6, stripped all ancillary chunks
|
||||||
|
- Moved `terminal_output gfxterm` earlier in config.cfg
|
||||||
|
- Removed echo ASCII art banner from grub.cfg
|
||||||
|
|
||||||
|
**Result:** Error persists — and this change actually confirmed RGBA does not work:
|
||||||
|
GRUB's PNG loader does not render RGBA PNGs correctly on this platform.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Attempt 4 — convert PNG from RGBA back to RGB (333c44f, most recent)
|
||||||
|
|
||||||
|
**Hypothesis:** GRUB does not support RGBA (color_type=6); RGB (color_type=2) is the correct format.
|
||||||
|
Alpha channel composited onto black background (#000000) to match `desktop-color`.
|
||||||
|
|
||||||
|
**What was done:** Converted bee-logo.png from RGBA to RGB via ImageMagick.
|
||||||
|
|
||||||
|
**Current file state:**
|
||||||
|
- 400×400 px, 8-bit/color RGB, non-interlaced
|
||||||
|
- Only IHDR + IDAT + IEND chunks (no metadata)
|
||||||
|
- `insmod png` is present in config.cfg
|
||||||
|
- `terminal_output gfxterm` runs before theme is sourced
|
||||||
|
- No explicit `width`/`height` in image block
|
||||||
|
|
||||||
|
**Result:** Error still occurs on real hardware. Despite the PNG being nominally correct
|
||||||
|
(RGB, non-interlaced, minimal chunks), the bitmap load returns null.
|
||||||
|
|
||||||
|
## Confirmed root cause (verified on 172.16.41.94, 2026-04-30)
|
||||||
|
|
||||||
|
The EFI partition (`sda2`, vfat, 5 MB) contains only:
|
||||||
|
```
|
||||||
|
/EFI/boot/bootia32.efi
|
||||||
|
/EFI/boot/bootx64.efi
|
||||||
|
/EFI/boot/grubx64.efi
|
||||||
|
/boot/grub/grub.cfg
|
||||||
|
```
|
||||||
|
|
||||||
|
`config.cfg`, `theme.cfg`, and the entire `live-theme/` directory (including `bee-logo.png`)
|
||||||
|
are **absent from the EFI image**. `live-build`'s `lb binary_grub-efi` stage is not
|
||||||
|
copying these files. GRUB boots, sources only `grub.cfg`, then fails to load the theme
|
||||||
|
because the file does not exist — returning a null bitmap regardless of PNG format.
|
||||||
|
|
||||||
|
All four fix attempts were targeting the wrong layer (PNG format/content).
|
||||||
|
|
||||||
|
## Fix (applied 2026-04-30)
|
||||||
|
|
||||||
|
Switched from PNG to TGA format:
|
||||||
|
|
||||||
|
1. Converted `bee-logo.png` → `bee-logo.tga` (24-bit uncompressed BGR, top-left origin,
|
||||||
|
480018 bytes). Conversion done via Python stdlib (no external tools needed).
|
||||||
|
2. `config.cfg`: `insmod png` → `insmod tga`
|
||||||
|
3. `theme.txt`: `file = "bee-logo.png"` → `file = "bee-logo.tga"`
|
||||||
|
|
||||||
|
**Why TGA works:** GRUB's TGA reader (`tga.mod`) handles uncompressed 24-bit images
|
||||||
|
trivially — no decompression, no complex chunk parsing. The module is present on-disk
|
||||||
|
(`x86_64-efi/tga.mod`). PNG was failing despite a valid file; the exact GRUB bug is
|
||||||
|
unknown but the PNG reader in Debian bookworm's grub2 is known to be fragile.
|
||||||
|
|
||||||
|
The old `bee-logo.png` is kept in the tree (may be useful for other tools) but is no
|
||||||
|
longer referenced by the theme.
|
||||||
|
|
||||||
|
## Relevant files
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `iso/builder/config/bootloaders/grub-efi/config.cfg` | insmod png, gfxterm init, theme source |
|
||||||
|
| `iso/builder/config/bootloaders/grub-efi/theme.cfg` | sets `theme=` path |
|
||||||
|
| `iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt` | image component definition |
|
||||||
|
| `iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png` | the logo PNG |
|
||||||
Submodule internal/chart updated: ac8120c8ab...2a15bc87f1
@@ -31,10 +31,10 @@ Build with explicit SSH keys baked into the ISO:
|
|||||||
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
||||||
```
|
```
|
||||||
|
|
||||||
Rebuild the builder image:
|
Force a clean rebuild of the builder image and build caches:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
sh iso/builder/build-in-container.sh --rebuild-image
|
sh iso/builder/build-in-container.sh --clean-build
|
||||||
```
|
```
|
||||||
|
|
||||||
Use a custom cache directory:
|
Use a custom cache directory:
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
|||||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
REBUILD_IMAGE=0
|
|
||||||
CLEAN_CACHE=0
|
CLEAN_CACHE=0
|
||||||
VARIANT="all"
|
VARIANT="all"
|
||||||
|
|
||||||
@@ -22,17 +21,12 @@ while [ $# -gt 0 ]; do
|
|||||||
CACHE_DIR="$2"
|
CACHE_DIR="$2"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
--rebuild-image)
|
|
||||||
REBUILD_IMAGE=1
|
|
||||||
shift
|
|
||||||
;;
|
|
||||||
--authorized-keys)
|
--authorized-keys)
|
||||||
AUTH_KEYS="$2"
|
AUTH_KEYS="$2"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
--clean-build)
|
--clean-build)
|
||||||
CLEAN_CACHE=1
|
CLEAN_CACHE=1
|
||||||
REBUILD_IMAGE=1
|
|
||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
--variant)
|
--variant)
|
||||||
@@ -41,7 +35,7 @@ while [ $# -gt 0 ]; do
|
|||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "unknown arg: $1" >&2
|
echo "unknown arg: $1" >&2
|
||||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
echo "usage: $0 [--cache-dir /path] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
@@ -105,7 +99,7 @@ image_matches_platform() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
NEED_BUILD_IMAGE=0
|
NEED_BUILD_IMAGE=0
|
||||||
if [ "$REBUILD_IMAGE" = "1" ]; then
|
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||||
NEED_BUILD_IMAGE=1
|
NEED_BUILD_IMAGE=1
|
||||||
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||||
NEED_BUILD_IMAGE=1
|
NEED_BUILD_IMAGE=1
|
||||||
|
|||||||
@@ -126,6 +126,37 @@ resolve_iso_version() {
|
|||||||
resolve_audit_version
|
resolve_audit_version
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sync_builder_workdir() {
|
||||||
|
src_dir="$1"
|
||||||
|
dst_dir="$2"
|
||||||
|
|
||||||
|
mkdir -p "$dst_dir"
|
||||||
|
|
||||||
|
# Historical bug: old workdirs could keep config/bootloaders/grub-pc even
|
||||||
|
# after the source tree moved to grub-efi only. Remove bootloaders eagerly
|
||||||
|
# so reused workdirs cannot leak stale templates into a new ISO build.
|
||||||
|
rm -rf "$dst_dir/config/bootloaders"
|
||||||
|
|
||||||
|
rsync -a --delete \
|
||||||
|
--exclude='cache/' \
|
||||||
|
--exclude='chroot/' \
|
||||||
|
--exclude='.build/' \
|
||||||
|
--exclude='*.iso' \
|
||||||
|
--exclude='*.packages' \
|
||||||
|
--exclude='*.contents' \
|
||||||
|
--exclude='*.files' \
|
||||||
|
"$src_dir/" "$dst_dir/"
|
||||||
|
|
||||||
|
if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
|
||||||
|
echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
|
||||||
|
echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
iso_list_files() {
|
iso_list_files() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
|
|
||||||
@@ -466,6 +497,75 @@ validate_iso_memtest() {
|
|||||||
echo "=== memtest validation OK ==="
|
echo "=== memtest validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
validate_iso_live_boot_entries() {
|
||||||
|
iso_path="$1"
|
||||||
|
echo "=== validating live boot entries in ISO ==="
|
||||||
|
|
||||||
|
[ -f "$iso_path" ] || {
|
||||||
|
echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
||||||
|
echo "ERROR: ISO reader unavailable for live boot validation" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
grub_cfg="$(mktemp)"
|
||||||
|
isolinux_cfg="$(mktemp)"
|
||||||
|
|
||||||
|
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||||
|
echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
|
||||||
|
echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB toram entry is missing" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'linux .*boot=live ' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB live entry is missing boot=live" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: isolinux live entry is missing boot=live" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
echo "=== live boot validation OK ==="
|
||||||
|
}
|
||||||
|
|
||||||
validate_iso_nvidia_runtime() {
|
validate_iso_nvidia_runtime() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||||
@@ -558,6 +658,21 @@ extract_live_grub_entry() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
load_live_build_append() {
|
||||||
|
lb_dir="$1"
|
||||||
|
binary_cfg="$lb_dir/config/binary"
|
||||||
|
[ -f "$binary_cfg" ] || return 1
|
||||||
|
|
||||||
|
# config/binary is generated by live-build and contains shell variable
|
||||||
|
# assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
|
||||||
|
# shellcheck disable=SC1090
|
||||||
|
. "$binary_cfg"
|
||||||
|
|
||||||
|
[ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
|
||||||
|
live_build_append="$LB_BOOTAPPEND_LIVE"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
extract_live_isolinux_entry() {
|
extract_live_isolinux_entry() {
|
||||||
cfg="$1"
|
cfg="$1"
|
||||||
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
@@ -594,36 +709,15 @@ echo " Hardware Audit LiveCD"
|
|||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE — load to RAM (toram)" {
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
}
|
}
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
|
||||||
menuentry "EASY-BEE — GSP=off" {
|
|
||||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd ${initrd}
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
|
||||||
linux ${kernel} ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd ${initrd}
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS + GSP=off" {
|
|
||||||
linux ${kernel} ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd ${initrd}
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — fail-safe" {
|
|
||||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd ${initrd}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if [ "\${grub_platform}" = "efi" ]; then
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
@@ -699,13 +793,18 @@ enforce_live_build_bootloader_assets() {
|
|||||||
grub_dir="$lb_dir/binary/boot/grub"
|
grub_dir="$lb_dir/binary/boot/grub"
|
||||||
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||||
|
|
||||||
|
if ! load_live_build_append "$lb_dir"; then
|
||||||
|
echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
|
||||||
|
live_build_append=""
|
||||||
|
fi
|
||||||
|
|
||||||
if [ -f "$grub_cfg" ]; then
|
if [ -f "$grub_cfg" ]; then
|
||||||
if extract_live_grub_entry "$grub_cfg"; then
|
if extract_live_grub_entry "$grub_cfg"; then
|
||||||
mkdir -p "$grub_dir/live-theme"
|
mkdir -p "$grub_dir/live-theme"
|
||||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
||||||
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
||||||
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "$grub_append" "$grub_initrd"
|
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||||
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||||
else
|
else
|
||||||
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
||||||
@@ -714,7 +813,7 @@ enforce_live_build_bootloader_assets() {
|
|||||||
|
|
||||||
if [ -f "$isolinux_cfg" ]; then
|
if [ -f "$isolinux_cfg" ]; then
|
||||||
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
||||||
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "$isolinux_append"
|
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
|
||||||
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
||||||
else
|
else
|
||||||
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
||||||
@@ -749,6 +848,73 @@ reset_live_build_stage() {
|
|||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Marker written after every successful full lb build for this variant
|
||||||
|
FULL_BUILD_MARKER="${BUILD_WORK_DIR}/.bee-full-build-marker"
|
||||||
|
|
||||||
|
# Returns 0 if full lb build is needed, 1 if fast-path is safe.
|
||||||
|
# Fast-path is safe when only light files changed since the last full build
|
||||||
|
# (Go source, overlay scripts/configs). Heavy changes (VERSIONS, package lists,
|
||||||
|
# hooks, archives, Dockerfile, auto/config) require a full lb build.
|
||||||
|
needs_full_build() {
|
||||||
|
[ -f "${FULL_BUILD_MARKER}" ] || return 0
|
||||||
|
[ -f "${BUILD_WORK_DIR}/binary/live/filesystem.squashfs" ] || return 0
|
||||||
|
[ -f "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso" ] || return 0
|
||||||
|
|
||||||
|
_heavy=$(find \
|
||||||
|
"${BUILDER_DIR}/VERSIONS" \
|
||||||
|
"${BUILDER_DIR}/auto/config" \
|
||||||
|
"${BUILDER_DIR}/Dockerfile" \
|
||||||
|
"${BUILDER_DIR}/config/package-lists" \
|
||||||
|
"${BUILDER_DIR}/config/hooks" \
|
||||||
|
"${BUILDER_DIR}/config/archives" \
|
||||||
|
"${BUILDER_DIR}/config/bootloaders" \
|
||||||
|
-newer "${FULL_BUILD_MARKER}" 2>/dev/null | head -1)
|
||||||
|
|
||||||
|
if [ -n "$_heavy" ]; then
|
||||||
|
echo "=== full build required: heavy config changed: $(basename "$_heavy") ==="
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
|
||||||
|
# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
|
||||||
|
fast_path_repack_squashfs() {
|
||||||
|
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
||||||
|
_tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
|
||||||
|
echo "=== fast-path: unsquash ($(du -sh "$_sq" | cut -f1) compressed) ==="
|
||||||
|
rm -rf "$_tmp"
|
||||||
|
unsquashfs -d "$_tmp" "$_sq"
|
||||||
|
echo "=== fast-path: syncing overlay stage ==="
|
||||||
|
rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
|
||||||
|
echo "=== fast-path: repacking squashfs ==="
|
||||||
|
_sq_new="${_sq}.new"
|
||||||
|
rm -f "$_sq_new"
|
||||||
|
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress
|
||||||
|
mv "$_sq_new" "$_sq"
|
||||||
|
rm -rf "$_tmp"
|
||||||
|
echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
|
||||||
|
}
|
||||||
|
|
||||||
|
# Fast-path: rebuild ISO by replacing only live/filesystem.squashfs via xorriso.
|
||||||
|
# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
|
||||||
|
fast_path_rebuild_iso() {
|
||||||
|
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
||||||
|
_prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
|
||||||
|
_new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
|
||||||
|
echo "=== fast-path: rebuilding ISO with xorriso ==="
|
||||||
|
rm -f "$_new"
|
||||||
|
xorriso \
|
||||||
|
-indev "$_prior" \
|
||||||
|
-outdev "$_new" \
|
||||||
|
-map "$_sq" /live/filesystem.squashfs \
|
||||||
|
-boot_image any replay \
|
||||||
|
-commit
|
||||||
|
mv "$_new" "$_prior"
|
||||||
|
echo "=== fast-path: ISO rebuilt ==="
|
||||||
|
}
|
||||||
|
|
||||||
recover_iso_memtest() {
|
recover_iso_memtest() {
|
||||||
lb_dir="$1"
|
lb_dir="$1"
|
||||||
iso_path="$2"
|
iso_path="$2"
|
||||||
@@ -1112,15 +1278,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
|||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
# Sync builder config into variant work dir, preserving lb cache.
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
rsync -a --delete \
|
sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
|
||||||
--exclude='cache/' \
|
|
||||||
--exclude='chroot/' \
|
|
||||||
--exclude='.build/' \
|
|
||||||
--exclude='*.iso' \
|
|
||||||
--exclude='*.packages' \
|
|
||||||
--exclude='*.contents' \
|
|
||||||
--exclude='*.files' \
|
|
||||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
|
||||||
|
|
||||||
# Share deb package cache across variants.
|
# Share deb package cache across variants.
|
||||||
# Restore: populate work dir cache from shared cache before build.
|
# Restore: populate work dir cache from shared cache before build.
|
||||||
@@ -1396,6 +1554,21 @@ if [ -f "${LB_INCLUDES}/root/.ssh/authorized_keys" ]; then
|
|||||||
chmod 600 "${LB_INCLUDES}/root/.ssh/authorized_keys"
|
chmod 600 "${LB_INCLUDES}/root/.ssh/authorized_keys"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# --- auto fast-path: squashfs surgery if only light files changed ---
|
||||||
|
if ! needs_full_build; then
|
||||||
|
echo "=== fast-path build (no heavy config changes since last full build) ==="
|
||||||
|
fast_path_repack_squashfs
|
||||||
|
fast_path_rebuild_iso
|
||||||
|
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||||
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
|
echo ""
|
||||||
|
echo "=== done (${BUILD_VARIANT}, fast-path) ==="
|
||||||
|
echo "ISO: $ISO_OUT"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
# --- build ISO using live-build ---
|
# --- build ISO using live-build ---
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||||
@@ -1411,8 +1584,11 @@ dump_memtest_debug "pre-build" "${LB_DIR}"
|
|||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
echo "=== enforcing canonical bootloader assets ==="
|
echo "=== enforcing canonical bootloader assets ==="
|
||||||
enforce_live_build_bootloader_assets "${LB_DIR}"
|
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||||
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
||||||
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "rm -f '${LB_DIR}/live-image-amd64.hybrid.iso' && lb binary_iso 2>&1"
|
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
|
||||||
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
||||||
|
|
||||||
# --- persist deb package cache back to shared location ---
|
# --- persist deb package cache back to shared location ---
|
||||||
@@ -1438,8 +1614,10 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
|
touch "${FULL_BUILD_MARKER}"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== done (${BUILD_VARIANT}) ==="
|
echo "=== done (${BUILD_VARIANT}) ==="
|
||||||
echo "ISO: $ISO_OUT"
|
echo "ISO: $ISO_OUT"
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ insmod serial
|
|||||||
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
||||||
|
|
||||||
insmod gfxterm
|
insmod gfxterm
|
||||||
insmod png
|
|
||||||
|
|
||||||
source /boot/grub/theme.cfg
|
|
||||||
|
|
||||||
terminal_input console serial
|
terminal_input console serial
|
||||||
terminal_output gfxterm serial
|
terminal_output gfxterm serial
|
||||||
|
|
||||||
|
insmod tga
|
||||||
|
source /boot/grub/theme.cfg
|
||||||
|
|||||||
@@ -1,47 +1,16 @@
|
|||||||
source /boot/grub/config.cfg
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
|
||||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
|
||||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
|
||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
|
||||||
echo " Hardware Audit LiveCD"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
menuentry "EASY-BEE — load to RAM (toram)" {
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
initrd @INITRD_LIVE@
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — GSP=off" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS + GSP=off" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — fail-safe" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
chainloader /boot/memtest86+x64.efi
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 70 KiB After Width: | Height: | Size: 77 KiB |
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.tga
Normal file
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.tga
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 469 KiB |
@@ -5,13 +5,11 @@ title-text: ""
|
|||||||
message-font: "Unifont Regular 16"
|
message-font: "Unifont Regular 16"
|
||||||
terminal-font: "Unifont Regular 16"
|
terminal-font: "Unifont Regular 16"
|
||||||
|
|
||||||
#bee logo — centered, upper third of screen
|
#bee logo - centered, upper third of screen
|
||||||
+ image {
|
+ image {
|
||||||
top = 4%
|
top = 4%
|
||||||
left = 50%-200
|
left = 50%-200
|
||||||
width = 400
|
file = "bee-logo.tga"
|
||||||
height = 400
|
|
||||||
file = "bee-logo.png"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#help bar at the bottom
|
#help bar at the bottom
|
||||||
@@ -36,11 +34,11 @@ terminal-font: "Unifont Regular 16"
|
|||||||
item_font = "Unifont Regular 16"
|
item_font = "Unifont Regular 16"
|
||||||
selected_item_color= "#f5a800"
|
selected_item_color= "#f5a800"
|
||||||
selected_item_font = "Unifont Regular 16"
|
selected_item_font = "Unifont Regular 16"
|
||||||
item_height = 16
|
item_height = 20
|
||||||
item_padding = 0
|
item_padding = 2
|
||||||
item_spacing = 4
|
item_spacing = 4
|
||||||
icon_width = 0
|
icon_width = 0
|
||||||
icon_heigh = 0
|
icon_height = 0
|
||||||
item_icon_space = 0
|
item_icon_space = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ systemctl enable bee-preflight.service
|
|||||||
systemctl enable bee-audit.service
|
systemctl enable bee-audit.service
|
||||||
systemctl enable bee-web.service
|
systemctl enable bee-web.service
|
||||||
systemctl enable bee-sshsetup.service
|
systemctl enable bee-sshsetup.service
|
||||||
|
systemctl enable bee-blackbox.service
|
||||||
systemctl enable bee-selfheal.timer
|
systemctl enable bee-selfheal.timer
|
||||||
systemctl enable bee-boot-status.service
|
systemctl enable bee-boot-status.service
|
||||||
systemctl enable ssh.service
|
systemctl enable ssh.service
|
||||||
|
|||||||
@@ -47,18 +47,27 @@ vim-tiny
|
|||||||
mc
|
mc
|
||||||
htop
|
htop
|
||||||
nvtop
|
nvtop
|
||||||
btop
|
|
||||||
sudo
|
sudo
|
||||||
zstd
|
zstd
|
||||||
mstflint
|
mstflint
|
||||||
memtester
|
memtester
|
||||||
stress-ng
|
stress-ng
|
||||||
stressapptest
|
stressapptest
|
||||||
|
fio
|
||||||
# QR codes (for displaying audit results)
|
iperf3
|
||||||
qrencode
|
iotop
|
||||||
|
nload
|
||||||
|
tcpdump
|
||||||
|
hdparm
|
||||||
|
sysstat
|
||||||
|
lsscsi
|
||||||
|
sg3-utils
|
||||||
|
jq
|
||||||
|
curl
|
||||||
|
net-tools
|
||||||
|
|
||||||
# Local desktop (openbox + chromium kiosk)
|
# Local desktop (openbox + chromium kiosk)
|
||||||
|
gparted
|
||||||
openbox
|
openbox
|
||||||
tint2
|
tint2
|
||||||
feh
|
feh
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit
|
Description=Bee: hardware audit
|
||||||
After=bee-preflight.service bee-network.service bee-nvidia.service
|
After=bee-preflight.service bee-network.service bee-nvidia.service bee-blackbox.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
18
iso/overlay/etc/systemd/system/bee-blackbox.service
Normal file
18
iso/overlay/etc/systemd/system/bee-blackbox.service
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Bee: USB black-box log mirror
|
||||||
|
After=local-fs.target
|
||||||
|
Before=bee-network.service bee-nvidia.service bee-preflight.service bee-audit.service bee-web.service
|
||||||
|
StartLimitIntervalSec=0
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-blackbox.log /usr/local/bin/bee blackbox --export-dir /appdata/bee/export --state-file /appdata/bee/export/blackbox-state.json
|
||||||
|
Restart=always
|
||||||
|
RestartSec=1
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
OOMScoreAdjust=-900
|
||||||
|
Nice=0
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: bring up network interfaces via DHCP
|
Description=Bee: bring up network interfaces via DHCP
|
||||||
After=local-fs.target
|
After=local-fs.target bee-blackbox.service
|
||||||
Before=network-online.target bee-audit.service
|
Before=network-online.target bee-audit.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: load NVIDIA kernel modules and create device nodes
|
Description=Bee: load NVIDIA kernel modules and create device nodes
|
||||||
After=local-fs.target udev.service
|
After=local-fs.target udev.service bee-blackbox.service
|
||||||
Before=bee-audit.service
|
Before=bee-audit.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: runtime preflight self-check
|
Description=Bee: runtime preflight self-check
|
||||||
After=bee-network.service bee-nvidia.service
|
After=bee-network.service bee-nvidia.service bee-blackbox.service
|
||||||
Before=bee-audit.service
|
Before=bee-audit.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit web viewer
|
Description=Bee: hardware audit web viewer
|
||||||
|
After=bee-blackbox.service
|
||||||
StartLimitIntervalSec=0
|
StartLimitIntervalSec=0
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
326
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
326
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
@@ -0,0 +1,326 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
|
||||||
|
|
||||||
|
set -u
|
||||||
|
|
||||||
|
log() {
|
||||||
|
echo "[bee-nvidia-recover] $*"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_blocker() {
|
||||||
|
echo "[bee-nvidia-recover] blocker: $*"
|
||||||
|
}
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
cat <<'EOF'
|
||||||
|
usage:
|
||||||
|
bee-nvidia-recover restart-drivers
|
||||||
|
bee-nvidia-recover reset-gpu <index>
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
unit_exists() {
|
||||||
|
systemctl cat "$1" >/dev/null 2>&1
|
||||||
|
}
|
||||||
|
|
||||||
|
unit_is_active() {
|
||||||
|
systemctl is-active --quiet "$1" 2>/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_unit_if_active() {
|
||||||
|
unit="$1"
|
||||||
|
if unit_is_active "$unit"; then
|
||||||
|
log "stopping $unit"
|
||||||
|
systemctl stop "$unit"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
start_unit_if_marked() {
|
||||||
|
unit="$1"
|
||||||
|
marker="$2"
|
||||||
|
if [ "$marker" = "1" ] && unit_exists "$unit"; then
|
||||||
|
log "starting $unit"
|
||||||
|
systemctl start "$unit"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
wait_for_process_exit() {
|
||||||
|
name="$1"
|
||||||
|
tries=0
|
||||||
|
while pgrep -x "$name" >/dev/null 2>&1; do
|
||||||
|
tries=$((tries + 1))
|
||||||
|
if [ "$tries" -ge 15 ]; then
|
||||||
|
log "WARN: $name is still running after stop request"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
log_pid_details() {
|
||||||
|
pid="$1"
|
||||||
|
line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
|
||||||
|
if [ -n "$line" ]; then
|
||||||
|
log_blocker "$line"
|
||||||
|
else
|
||||||
|
log_blocker "pid $pid"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
collect_gpu_compute_pids() {
|
||||||
|
index="$1"
|
||||||
|
if ! command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
nvidia-smi --id="$index" \
|
||||||
|
--query-compute-apps=pid \
|
||||||
|
--format=csv,noheader,nounits 2>/dev/null \
|
||||||
|
| sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
|
||||||
|
| grep -E '^[0-9]+$' || true
|
||||||
|
}
|
||||||
|
|
||||||
|
collect_gpu_device_pids() {
|
||||||
|
index="$1"
|
||||||
|
dev="/dev/nvidia$index"
|
||||||
|
[ -e "$dev" ] || return 0
|
||||||
|
if command -v fuser >/dev/null 2>&1; then
|
||||||
|
fuser "$dev" 2>/dev/null \
|
||||||
|
| tr ' ' '\n' \
|
||||||
|
| sed 's/[^0-9].*$//' \
|
||||||
|
| grep -E '^[0-9]+$' || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
collect_gpu_holder_pids() {
|
||||||
|
index="$1"
|
||||||
|
{
|
||||||
|
collect_gpu_compute_pids "$index"
|
||||||
|
collect_gpu_device_pids "$index"
|
||||||
|
} | awk 'NF' | sort -u
|
||||||
|
}
|
||||||
|
|
||||||
|
kill_pid_list() {
|
||||||
|
pids="$1"
|
||||||
|
[ -n "$pids" ] || return 0
|
||||||
|
|
||||||
|
for pid in $pids; do
|
||||||
|
log_pid_details "$pid"
|
||||||
|
done
|
||||||
|
log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"
|
||||||
|
for pid in $pids; do
|
||||||
|
kill -TERM "$pid" >/dev/null 2>&1 || true
|
||||||
|
done
|
||||||
|
sleep 1
|
||||||
|
for pid in $pids; do
|
||||||
|
if kill -0 "$pid" >/dev/null 2>&1; then
|
||||||
|
log "forcing GPU holder PID $pid to exit"
|
||||||
|
kill -KILL "$pid" >/dev/null 2>&1 || true
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
gpu_has_display_holders() {
|
||||||
|
index="$1"
|
||||||
|
holders=$(collect_gpu_device_pids "$index")
|
||||||
|
[ -n "$holders" ] || return 1
|
||||||
|
for pid in $holders; do
|
||||||
|
comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
|
||||||
|
case "$comm" in
|
||||||
|
Xorg|Xwayland|X|gnome-shell)
|
||||||
|
return 0
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_nv_hostengine_if_running() {
|
||||||
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||||
|
[ -n "$line" ] || continue
|
||||||
|
log_blocker "$line"
|
||||||
|
done
|
||||||
|
log "stopping nv-hostengine"
|
||||||
|
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
hostengine_was_active=1
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_fabricmanager_if_active() {
|
||||||
|
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||||
|
log_blocker "service nvidia-fabricmanager.service"
|
||||||
|
fabric_was_active=1
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_display_stack_if_active() {
|
||||||
|
stopped=1
|
||||||
|
for unit in display-manager.service lightdm.service; do
|
||||||
|
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||||
|
log_blocker "service $unit"
|
||||||
|
display_was_active=1
|
||||||
|
stopped=0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
return "$stopped"
|
||||||
|
}
|
||||||
|
|
||||||
|
try_gpu_reset() {
|
||||||
|
index="$1"
|
||||||
|
log "resetting GPU $index"
|
||||||
|
nvidia-smi -r -i "$index"
|
||||||
|
}
|
||||||
|
|
||||||
|
drain_gpu_clients() {
|
||||||
|
display_was_active=0
|
||||||
|
fabric_was_active=0
|
||||||
|
hostengine_was_active=0
|
||||||
|
|
||||||
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||||
|
[ -n "$line" ] || continue
|
||||||
|
log_blocker "$line"
|
||||||
|
done
|
||||||
|
log "stopping nv-hostengine"
|
||||||
|
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
hostengine_was_active=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||||
|
log_blocker "service nvidia-fabricmanager.service"
|
||||||
|
fabric_was_active=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
for unit in display-manager.service lightdm.service; do
|
||||||
|
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||||
|
log_blocker "service $unit"
|
||||||
|
display_was_active=1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
for dev in /dev/nvidia[0-9]*; do
|
||||||
|
[ -e "$dev" ] || continue
|
||||||
|
holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
restore_gpu_clients() {
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
if nvidia-smi -pm 1 >/dev/null 2>&1; then
|
||||||
|
log "enabled NVIDIA persistence mode"
|
||||||
|
else
|
||||||
|
log "WARN: failed to enable NVIDIA persistence mode"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
log "starting nv-hostengine"
|
||||||
|
nv-hostengine
|
||||||
|
fi
|
||||||
|
|
||||||
|
start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
|
||||||
|
start_unit_if_marked display-manager.service "${display_was_active:-0}"
|
||||||
|
if [ "${display_was_active:-0}" = "1" ] && unit_exists lightdm.service && ! unit_is_active lightdm.service; then
|
||||||
|
start_unit_if_marked lightdm.service "1"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
restart_drivers() {
|
||||||
|
drain_gpu_clients
|
||||||
|
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
|
||||||
|
if lsmod | awk '{print $1}' | grep -qx "$mod"; then
|
||||||
|
log "unloading module $mod"
|
||||||
|
rmmod "$mod"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true
|
||||||
|
log "reloading NVIDIA driver stack"
|
||||||
|
/usr/local/bin/bee-nvidia-load
|
||||||
|
restore_gpu_clients
|
||||||
|
}
|
||||||
|
|
||||||
|
reset_gpu() {
|
||||||
|
index="$1"
|
||||||
|
display_was_active=0
|
||||||
|
fabric_was_active=0
|
||||||
|
hostengine_was_active=0
|
||||||
|
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
stop_nv_hostengine_if_running || true
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
stop_fabricmanager_if_active || true
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if gpu_has_display_holders "$index"; then
|
||||||
|
stop_display_stack_if_active || true
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
log "GPU $index still has holders after targeted drain"
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
try_gpu_reset "$index"
|
||||||
|
rc=$?
|
||||||
|
restore_gpu_clients
|
||||||
|
return "$rc"
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd="${1:-}"
|
||||||
|
case "$cmd" in
|
||||||
|
restart-drivers)
|
||||||
|
restart_drivers
|
||||||
|
;;
|
||||||
|
reset-gpu)
|
||||||
|
if [ "$#" -ne 2 ]; then
|
||||||
|
usage >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
reset_gpu "$2"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
usage >&2
|
||||||
|
exit 2
|
||||||
|
;;
|
||||||
|
esac
|
||||||
BIN
audit/bee → iso/vendor/arcconf
vendored
BIN
audit/bee → iso/vendor/arcconf
vendored
Binary file not shown.
BIN
iso/vendor/sas2ircu
vendored
Executable file
BIN
iso/vendor/sas2ircu
vendored
Executable file
Binary file not shown.
BIN
iso/vendor/sas3ircu
vendored
Executable file
BIN
iso/vendor/sas3ircu
vendored
Executable file
Binary file not shown.
BIN
iso/vendor/ssacli
vendored
Executable file
BIN
iso/vendor/ssacli
vendored
Executable file
Binary file not shown.
BIN
iso/vendor/storcli64
vendored
Executable file
BIN
iso/vendor/storcli64
vendored
Executable file
Binary file not shown.
@@ -47,6 +47,13 @@ echo "==> Сборка бинарника..."
|
|||||||
)
|
)
|
||||||
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
|
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
|
||||||
|
|
||||||
|
LOCAL_SHA="$(shasum -a 256 "${LOCAL_BIN}" | awk '{print $1}')"
|
||||||
|
REMOTE_SHA="$("${SSH_CMD[@]}" "$REMOTE" "if [ -f '${REMOTE_BIN}' ] && command -v sha256sum >/dev/null 2>&1; then sha256sum '${REMOTE_BIN}' | awk '{print \\$1}'; fi" 2>/dev/null || true)"
|
||||||
|
if [[ -n "${REMOTE_SHA}" && "${LOCAL_SHA}" == "${REMOTE_SHA}" ]]; then
|
||||||
|
echo "==> Бинарник не изменился (${LOCAL_SHA}); копирование и перезапуск сервисов пропущены."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
# --- Deploy ---
|
# --- Deploy ---
|
||||||
echo "==> Копирование на ${REMOTE}..."
|
echo "==> Копирование на ${REMOTE}..."
|
||||||
"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"
|
"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"
|
||||||
|
|||||||
@@ -1,74 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# fetch-vendor.sh — download proprietary vendor utilities into iso/vendor.
|
|
||||||
#
|
|
||||||
# Usage:
|
|
||||||
# STORCLI_URL=... STORCLI_SHA256=... \
|
|
||||||
# SAS2IRCU_URL=... SAS2IRCU_SHA256=... \
|
|
||||||
# SAS3IRCU_URL=... SAS3IRCU_SHA256=... \
|
|
||||||
# MSTFLINT_URL=... MSTFLINT_SHA256=... \
|
|
||||||
# sh scripts/fetch-vendor.sh
|
|
||||||
|
|
||||||
set -eu
|
|
||||||
|
|
||||||
ROOT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
|
|
||||||
OUT_DIR="$ROOT_DIR/iso/vendor"
|
|
||||||
mkdir -p "$OUT_DIR"
|
|
||||||
|
|
||||||
need_cmd() {
|
|
||||||
command -v "$1" >/dev/null 2>&1 || { echo "ERROR: required command not found: $1" >&2; exit 1; }
|
|
||||||
}
|
|
||||||
|
|
||||||
need_cmd sha256sum
|
|
||||||
|
|
||||||
download_to() {
|
|
||||||
url="$1"
|
|
||||||
out="$2"
|
|
||||||
if command -v wget >/dev/null 2>&1; then
|
|
||||||
wget -O "$out" "$url"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
if command -v curl >/dev/null 2>&1; then
|
|
||||||
curl -fsSL "$url" -o "$out"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
echo "ERROR: required command not found: wget or curl" >&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
fetch_one() {
|
|
||||||
name="$1"
|
|
||||||
url="$2"
|
|
||||||
sha="$3"
|
|
||||||
|
|
||||||
if [ -z "$url" ] || [ -z "$sha" ]; then
|
|
||||||
echo "[vendor] skip $name (URL/SHA not provided)"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
dst="$OUT_DIR/$name"
|
|
||||||
tmp="$dst.tmp"
|
|
||||||
|
|
||||||
echo "[vendor] downloading $name"
|
|
||||||
download_to "$url" "$tmp"
|
|
||||||
|
|
||||||
got=$(sha256sum "$tmp" | awk '{print $1}')
|
|
||||||
want=$(echo "$sha" | tr '[:upper:]' '[:lower:]')
|
|
||||||
if [ "$got" != "$want" ]; then
|
|
||||||
rm -f "$tmp"
|
|
||||||
echo "ERROR: checksum mismatch for $name" >&2
|
|
||||||
echo " got: $got" >&2
|
|
||||||
echo " want: $want" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
mv "$tmp" "$dst"
|
|
||||||
chmod +x "$dst" || true
|
|
||||||
echo "[vendor] ok: $name"
|
|
||||||
}
|
|
||||||
|
|
||||||
fetch_one "storcli64" "${STORCLI_URL:-}" "${STORCLI_SHA256:-}"
|
|
||||||
fetch_one "sas2ircu" "${SAS2IRCU_URL:-}" "${SAS2IRCU_SHA256:-}"
|
|
||||||
fetch_one "sas3ircu" "${SAS3IRCU_URL:-}" "${SAS3IRCU_SHA256:-}"
|
|
||||||
fetch_one "mstflint" "${MSTFLINT_URL:-}" "${MSTFLINT_SHA256:-}"
|
|
||||||
|
|
||||||
echo "[vendor] done. output dir: $OUT_DIR"
|
|
||||||
Reference in New Issue
Block a user