Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 29179917c3 | | |
| | be4b439804 | | |
| | 749fc8a94d | | |
| | 6112094d45 | | |
| | e9a2bc9f9d | | |
@@ -2,6 +2,7 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@@ -67,10 +68,14 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
|||||||
return runSupportBundle(args[1:], stdout, stderr)
|
return runSupportBundle(args[1:], stdout, stderr)
|
||||||
case "web":
|
case "web":
|
||||||
return runWeb(args[1:], stdout, stderr)
|
return runWeb(args[1:], stdout, stderr)
|
||||||
|
case "blackbox":
|
||||||
|
return runBlackbox(args[1:], stdout, stderr)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT(args[1:], stdout, stderr)
|
return runSAT(args[1:], stdout, stderr)
|
||||||
case "benchmark":
|
case "benchmark":
|
||||||
return runBenchmark(args[1:], stdout, stderr)
|
return runBenchmark(args[1:], stdout, stderr)
|
||||||
|
case "bee-worker":
|
||||||
|
return runBeeWorker(args[1:], stdout, stderr)
|
||||||
case "version", "--version", "-version":
|
case "version", "--version", "-version":
|
||||||
fmt.Fprintln(stdout, Version)
|
fmt.Fprintln(stdout, Version)
|
||||||
return 0
|
return 0
|
||||||
@@ -88,8 +93,10 @@ func printRootUsage(w io.Writer) {
|
|||||||
bee export --target <device>
|
bee export --target <device>
|
||||||
bee support-bundle --output stdout|file:<path>
|
bee support-bundle --output stdout|file:<path>
|
||||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||||
|
bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
|
||||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||||
|
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
|
||||||
bee version
|
bee version
|
||||||
bee help [command]`)
|
bee help [command]`)
|
||||||
}
|
}
|
||||||
@@ -106,10 +113,14 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||||
case "web":
|
case "web":
|
||||||
return runWeb([]string{"--help"}, stdout, stdout)
|
return runWeb([]string{"--help"}, stdout, stdout)
|
||||||
|
case "blackbox":
|
||||||
|
return runBlackbox([]string{"--help"}, stdout, stdout)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT([]string{"--help"}, stdout, stderr)
|
return runSAT([]string{"--help"}, stdout, stderr)
|
||||||
case "benchmark":
|
case "benchmark":
|
||||||
return runBenchmark([]string{"--help"}, stdout, stderr)
|
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||||
|
case "bee-worker":
|
||||||
|
return runBeeWorker([]string{"--help"}, stdout, stderr)
|
||||||
case "version":
|
case "version":
|
||||||
fmt.Fprintln(stdout, "usage: bee version")
|
fmt.Fprintln(stdout, "usage: bee version")
|
||||||
return 0
|
return 0
|
||||||
@@ -335,6 +346,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBlackbox(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||||
|
statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
|
||||||
|
if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
|
||||||
|
slog.Error("run blackbox", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||||
@@ -462,6 +500,28 @@ func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBeeWorker(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
|
||||||
|
taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
|
||||||
|
}
|
||||||
|
|
||||||
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||||
raw = strings.TrimSpace(raw)
|
raw = strings.TrimSpace(raw)
|
||||||
if raw == "" {
|
if raw == "" {
|
||||||
|
|||||||
779
audit/internal/app/blackbox.go
Normal file
779
audit/internal/app/blackbox.go
Normal file
@@ -0,0 +1,779 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"crypto/rand"
|
||||||
|
"encoding/hex"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io/fs"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
blackboxMarkerName = ".bee-blackbox"
|
||||||
|
blackboxDiscoverInterval = 2 * time.Second
|
||||||
|
blackboxMinFlushPeriod = 1 * time.Second
|
||||||
|
blackboxMaxFlushPeriod = 30 * time.Second
|
||||||
|
blackboxRecoveryFastCount = 5
|
||||||
|
)
|
||||||
|
|
||||||
|
var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
|
||||||
|
|
||||||
|
var (
|
||||||
|
blackboxExecCommand = exec.Command
|
||||||
|
blackboxNow = func() time.Time { return time.Now().UTC() }
|
||||||
|
)
|
||||||
|
|
||||||
|
// BlackboxMarker is the JSON document stored in the marker file of an
// enrolled target.
type BlackboxMarker struct {
	// Version is the marker format version; writeBlackboxMarker fills in 1
	// when it is zero.
	Version int `json:"version"`
	// EnrollmentID identifies the enrollment; workers are keyed by it.
	EnrollmentID string `json:"enrollment_id"`
	// CreatedAtUTC is the creation time, written as RFC 3339 by
	// EnableBlackboxTarget.
	CreatedAtUTC string `json:"created_at_utc"`
	// Host is the host name recorded at enrollment; omitted when empty.
	Host string `json:"host,omitempty"`
}
|
||||||
|
|
||||||
|
type BlackboxTargetStatus struct {
|
||||||
|
EnrollmentID string `json:"enrollment_id"`
|
||||||
|
Device string `json:"device"`
|
||||||
|
FS platform.RemovableTarget `json:"fs"`
|
||||||
|
BootFolder string `json:"boot_folder"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
LastSyncAtUTC string `json:"last_sync_at_utc,omitempty"`
|
||||||
|
LastCycleDuration string `json:"last_cycle_duration,omitempty"`
|
||||||
|
FlushPeriod string `json:"flush_period"`
|
||||||
|
LastError string `json:"last_error,omitempty"`
|
||||||
|
Mountpoint string `json:"mountpoint,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BlackboxState struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
BootStartedAtUTC string `json:"boot_started_at_utc"`
|
||||||
|
BootFolder string `json:"boot_folder"`
|
||||||
|
UpdatedAtUTC string `json:"updated_at_utc"`
|
||||||
|
Targets []BlackboxTargetStatus `json:"targets"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type blackboxRuntime struct {
|
||||||
|
exportDir string
|
||||||
|
statePath string
|
||||||
|
system *platform.System
|
||||||
|
bootStarted time.Time
|
||||||
|
bootFolder string
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
workers map[string]*blackboxWorker
|
||||||
|
}
|
||||||
|
|
||||||
|
type discoveredBlackboxTarget struct {
|
||||||
|
marker BlackboxMarker
|
||||||
|
target platform.RemovableTarget
|
||||||
|
seenMount string
|
||||||
|
mountedByBee bool
|
||||||
|
}
|
||||||
|
|
||||||
|
type blackboxWorker struct {
|
||||||
|
runtime *blackboxRuntime
|
||||||
|
enrollmentID string
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
target platform.RemovableTarget
|
||||||
|
marker BlackboxMarker
|
||||||
|
mountpoint string
|
||||||
|
mountedByBee bool
|
||||||
|
status string
|
||||||
|
lastSyncAt time.Time
|
||||||
|
lastDuration time.Duration
|
||||||
|
flushPeriod time.Duration
|
||||||
|
lastError string
|
||||||
|
fastCycles int
|
||||||
|
stopCh chan struct{}
|
||||||
|
stoppedCh chan struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
|
||||||
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
|
if exportDir == "" {
|
||||||
|
exportDir = DefaultExportDir
|
||||||
|
}
|
||||||
|
statePath = strings.TrimSpace(statePath)
|
||||||
|
if statePath == "" {
|
||||||
|
statePath = DefaultBlackboxStatePath
|
||||||
|
}
|
||||||
|
if system == nil {
|
||||||
|
system = platform.New()
|
||||||
|
}
|
||||||
|
bootStarted, err := bootStartedAtUTC()
|
||||||
|
if err != nil {
|
||||||
|
bootStarted = blackboxNow()
|
||||||
|
}
|
||||||
|
rt := &blackboxRuntime{
|
||||||
|
exportDir: exportDir,
|
||||||
|
statePath: statePath,
|
||||||
|
system: system,
|
||||||
|
bootStarted: bootStarted,
|
||||||
|
bootFolder: SupportBundleBaseName(bootStarted),
|
||||||
|
workers: make(map[string]*blackboxWorker),
|
||||||
|
}
|
||||||
|
_ = os.MkdirAll(filepath.Dir(statePath), 0755)
|
||||||
|
rt.persistState()
|
||||||
|
ticker := time.NewTicker(blackboxDiscoverInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
rt.reconcile()
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
rt.stopAll()
|
||||||
|
return ctx.Err()
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ReadBlackboxState(path string) (BlackboxState, error) {
|
||||||
|
path = strings.TrimSpace(path)
|
||||||
|
if path == "" {
|
||||||
|
path = DefaultBlackboxStatePath
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return BlackboxState{}, err
|
||||||
|
}
|
||||||
|
var state BlackboxState
|
||||||
|
if err := json.Unmarshal(raw, &state); err != nil {
|
||||||
|
return BlackboxState{}, err
|
||||||
|
}
|
||||||
|
return state, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
if target.Device == "" {
|
||||||
|
return BlackboxMarker{}, fmt.Errorf("device is required")
|
||||||
|
}
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
|
||||||
|
if err != nil {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
marker, _, err := readBlackboxMarker(mountpoint)
|
||||||
|
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
if marker.EnrollmentID == "" {
|
||||||
|
marker = BlackboxMarker{
|
||||||
|
Version: 1,
|
||||||
|
EnrollmentID: newBlackboxEnrollmentID(),
|
||||||
|
CreatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||||
|
Host: hostnameOr("unknown"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := writeBlackboxMarker(mountpoint, marker); err != nil {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
return marker, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func DisableBlackboxTarget(device, enrollmentID string) error {
|
||||||
|
device = strings.TrimSpace(device)
|
||||||
|
enrollmentID = strings.TrimSpace(enrollmentID)
|
||||||
|
if device == "" && enrollmentID == "" {
|
||||||
|
return fmt.Errorf("device or enrollment_id is required")
|
||||||
|
}
|
||||||
|
system := platform.New()
|
||||||
|
targets, err := system.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, target := range targets {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
|
||||||
|
if mountErr != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
remove := false
|
||||||
|
marker, _, err := readBlackboxMarker(mountpoint)
|
||||||
|
if err == nil {
|
||||||
|
if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
|
||||||
|
remove = true
|
||||||
|
}
|
||||||
|
if device != "" && target.Device == device {
|
||||||
|
remove = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if remove {
|
||||||
|
err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
|
||||||
|
}
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
if remove {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return os.ErrNotExist
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) reconcile() {
|
||||||
|
discovered, _ := rt.discoverMarkedTargets()
|
||||||
|
|
||||||
|
rt.mu.Lock()
|
||||||
|
defer rt.mu.Unlock()
|
||||||
|
|
||||||
|
seen := make(map[string]struct{}, len(discovered))
|
||||||
|
for _, found := range discovered {
|
||||||
|
seen[found.marker.EnrollmentID] = struct{}{}
|
||||||
|
worker, ok := rt.workers[found.marker.EnrollmentID]
|
||||||
|
if !ok {
|
||||||
|
worker = newBlackboxWorker(rt, found)
|
||||||
|
rt.workers[found.marker.EnrollmentID] = worker
|
||||||
|
go worker.run()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
worker.update(found)
|
||||||
|
}
|
||||||
|
for id, worker := range rt.workers {
|
||||||
|
if _, ok := seen[id]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
worker.stop()
|
||||||
|
delete(rt.workers, id)
|
||||||
|
}
|
||||||
|
rt.persistStateLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) stopAll() {
|
||||||
|
rt.mu.Lock()
|
||||||
|
workers := make([]*blackboxWorker, 0, len(rt.workers))
|
||||||
|
for _, worker := range rt.workers {
|
||||||
|
workers = append(workers, worker)
|
||||||
|
}
|
||||||
|
rt.workers = map[string]*blackboxWorker{}
|
||||||
|
rt.persistStateLocked()
|
||||||
|
rt.mu.Unlock()
|
||||||
|
for _, worker := range workers {
|
||||||
|
worker.stop()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
|
||||||
|
targets, err := rt.system.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var out []discoveredBlackboxTarget
|
||||||
|
for _, rawTarget := range targets {
|
||||||
|
target := sanitizeRemovableTarget(rawTarget)
|
||||||
|
if target.Device == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
marker, ok, err := readBlackboxMarker(mountpoint)
|
||||||
|
if mountedByBee && !ok {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
if err != nil || !ok || marker.EnrollmentID == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
out = append(out, discoveredBlackboxTarget{
|
||||||
|
marker: marker,
|
||||||
|
target: target,
|
||||||
|
seenMount: mountpoint,
|
||||||
|
mountedByBee: mountedByBee,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(out, func(i, j int) bool {
|
||||||
|
return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
|
||||||
|
})
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
|
||||||
|
return &blackboxWorker{
|
||||||
|
runtime: rt,
|
||||||
|
enrollmentID: found.marker.EnrollmentID,
|
||||||
|
target: found.target,
|
||||||
|
marker: found.marker,
|
||||||
|
flushPeriod: blackboxMinFlushPeriod,
|
||||||
|
status: "running",
|
||||||
|
stopCh: make(chan struct{}),
|
||||||
|
stoppedCh: make(chan struct{}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) run() {
|
||||||
|
defer close(w.stoppedCh)
|
||||||
|
for {
|
||||||
|
start := time.Now()
|
||||||
|
err := w.syncCycle()
|
||||||
|
duration := time.Since(start)
|
||||||
|
w.finishCycle(duration, err)
|
||||||
|
|
||||||
|
wait := w.currentFlushPeriod()
|
||||||
|
timer := time.NewTimer(wait)
|
||||||
|
select {
|
||||||
|
case <-w.stopCh:
|
||||||
|
timer.Stop()
|
||||||
|
w.cleanup()
|
||||||
|
return
|
||||||
|
case <-timer.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.target = found.target
|
||||||
|
w.marker = found.marker
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) stop() {
|
||||||
|
select {
|
||||||
|
case <-w.stopCh:
|
||||||
|
default:
|
||||||
|
close(w.stopCh)
|
||||||
|
}
|
||||||
|
<-w.stoppedCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.flushPeriod
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.lastDuration = duration
|
||||||
|
if err != nil {
|
||||||
|
w.status = "degraded"
|
||||||
|
w.lastError = err.Error()
|
||||||
|
w.fastCycles = 0
|
||||||
|
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
|
||||||
|
} else {
|
||||||
|
w.status = "running"
|
||||||
|
w.lastSyncAt = blackboxNow()
|
||||||
|
w.lastError = ""
|
||||||
|
if duration <= w.flushPeriod/2 {
|
||||||
|
w.fastCycles++
|
||||||
|
} else {
|
||||||
|
w.fastCycles = 0
|
||||||
|
}
|
||||||
|
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||||
|
}
|
||||||
|
w.runtime.persistState()
|
||||||
|
}
|
||||||
|
|
||||||
|
func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
|
||||||
|
if current <= 0 {
|
||||||
|
current = blackboxMinFlushPeriod
|
||||||
|
}
|
||||||
|
if duration <= 0 {
|
||||||
|
duration = current
|
||||||
|
}
|
||||||
|
next := current
|
||||||
|
if duration > current {
|
||||||
|
growA := time.Duration(float64(current) * 1.25)
|
||||||
|
growB := time.Duration(float64(duration) * 1.25)
|
||||||
|
if growB > growA {
|
||||||
|
next = growB
|
||||||
|
} else {
|
||||||
|
next = growA
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if success && fastCycles >= blackboxRecoveryFastCount {
|
||||||
|
next = time.Duration(float64(current) * 0.9)
|
||||||
|
}
|
||||||
|
if next < blackboxMinFlushPeriod {
|
||||||
|
next = blackboxMinFlushPeriod
|
||||||
|
}
|
||||||
|
if next > blackboxMaxFlushPeriod {
|
||||||
|
next = blackboxMaxFlushPeriod
|
||||||
|
}
|
||||||
|
return next
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) syncCycle() error {
|
||||||
|
target, marker := w.snapshotTarget()
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
w.recordMountpoint(mountpoint, mountedByBee)
|
||||||
|
|
||||||
|
root := filepath.Join(mountpoint, w.runtime.bootFolder)
|
||||||
|
if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := w.captureSnapshots(root); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return syncFilesystem(root)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) cleanup() {
|
||||||
|
w.mu.Lock()
|
||||||
|
mountpoint := w.mountpoint
|
||||||
|
mountedByBee := w.mountedByBee
|
||||||
|
w.mu.Unlock()
|
||||||
|
if mountedByBee && mountpoint != "" {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.target, w.marker
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.mountpoint = mountpoint
|
||||||
|
w.mountedByBee = mountedByBee
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) captureSnapshots(root string) error {
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, svc := range supportBundleServices {
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, item := range supportBundleOptionalFiles {
|
||||||
|
if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) persistState() {
|
||||||
|
rt.mu.Lock()
|
||||||
|
defer rt.mu.Unlock()
|
||||||
|
rt.persistStateLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) persistStateLocked() {
|
||||||
|
state := BlackboxState{
|
||||||
|
Status: "disabled",
|
||||||
|
BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
|
||||||
|
BootFolder: rt.bootFolder,
|
||||||
|
UpdatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||||
|
Targets: make([]BlackboxTargetStatus, 0, len(rt.workers)),
|
||||||
|
}
|
||||||
|
if len(rt.workers) > 0 {
|
||||||
|
state.Status = "running"
|
||||||
|
}
|
||||||
|
for _, worker := range rt.workers {
|
||||||
|
worker.mu.Lock()
|
||||||
|
targetState := BlackboxTargetStatus{
|
||||||
|
EnrollmentID: worker.enrollmentID,
|
||||||
|
Device: worker.target.Device,
|
||||||
|
FS: worker.target,
|
||||||
|
BootFolder: rt.bootFolder,
|
||||||
|
Status: worker.status,
|
||||||
|
FlushPeriod: worker.flushPeriod.String(),
|
||||||
|
LastError: worker.lastError,
|
||||||
|
Mountpoint: worker.mountpoint,
|
||||||
|
}
|
||||||
|
if !worker.lastSyncAt.IsZero() {
|
||||||
|
targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
|
||||||
|
}
|
||||||
|
if worker.lastDuration > 0 {
|
||||||
|
targetState.LastCycleDuration = worker.lastDuration.String()
|
||||||
|
}
|
||||||
|
if worker.status == "degraded" {
|
||||||
|
state.Status = "degraded"
|
||||||
|
}
|
||||||
|
worker.mu.Unlock()
|
||||||
|
state.Targets = append(state.Targets, targetState)
|
||||||
|
}
|
||||||
|
sort.Slice(state.Targets, func(i, j int) bool {
|
||||||
|
return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
|
||||||
|
})
|
||||||
|
_ = writeJSONAtomic(rt.statePath, state)
|
||||||
|
}
|
||||||
|
|
||||||
|
// bootStartedAtUTC returns the boot time in UTC, taken from the "btime" line
// of /proc/stat (seconds since the Unix epoch).
//
// It fails when /proc/stat cannot be read, and with "boot time not found"
// when there is no btime line or the first one is malformed.
func bootStartedAtUTC() (time.Time, error) {
	raw, err := os.ReadFile("/proc/stat")
	if err != nil {
		return time.Time{}, err
	}
	for _, line := range strings.Split(string(raw), "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "btime ") {
			continue
		}
		// Only the first btime line is considered.
		if fields := strings.Fields(line); len(fields) == 2 {
			// The seconds value is parsed by way of a duration string.
			if elapsed, parseErr := time.ParseDuration(fields[1] + "s"); parseErr == nil {
				return time.Unix(int64(elapsed/time.Second), 0).UTC(), nil
			}
		}
		break
	}
	return time.Time{}, fmt.Errorf("boot time not found")
}
|
||||||
|
|
||||||
|
// newBlackboxEnrollmentID returns a new enrollment ID of the form
// "bb-<16 hex digits>" built from 8 random bytes. If the random source
// fails, the current Unix time in nanoseconds is used as the suffix instead.
func newBlackboxEnrollmentID() string {
	entropy := make([]byte, 8)
	if _, err := rand.Read(entropy); err == nil {
		return "bb-" + hex.EncodeToString(entropy)
	}
	return fmt.Sprintf("bb-%d", time.Now().UnixNano())
}
|
||||||
|
|
||||||
|
func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
|
||||||
|
target.Device = strings.TrimSpace(target.Device)
|
||||||
|
target.FSType = strings.TrimSpace(target.FSType)
|
||||||
|
target.Size = strings.TrimSpace(target.Size)
|
||||||
|
target.Label = strings.TrimSpace(target.Label)
|
||||||
|
target.Model = strings.TrimSpace(target.Model)
|
||||||
|
target.Mountpoint = strings.TrimSpace(target.Mountpoint)
|
||||||
|
return target
|
||||||
|
}
|
||||||
|
|
||||||
|
func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
if target.Mountpoint != "" {
|
||||||
|
if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
|
||||||
|
return target.Mountpoint, false, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
|
||||||
|
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||||
|
return "", false, formatBlackboxMountTargetError(target, string(raw), err)
|
||||||
|
}
|
||||||
|
if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
return mountpoint, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func unmountTarget(mountpoint string) error {
|
||||||
|
_ = blackboxExecCommand("sync").Run()
|
||||||
|
raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
msg := strings.TrimSpace(string(raw))
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
|
||||||
|
raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
return BlackboxMarker{}, false, os.ErrNotExist
|
||||||
|
}
|
||||||
|
return BlackboxMarker{}, false, err
|
||||||
|
}
|
||||||
|
var marker BlackboxMarker
|
||||||
|
if err := json.Unmarshal(raw, &marker); err != nil {
|
||||||
|
return BlackboxMarker{}, false, err
|
||||||
|
}
|
||||||
|
return marker, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
|
||||||
|
if marker.Version == 0 {
|
||||||
|
marker.Version = 1
|
||||||
|
}
|
||||||
|
return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
|
||||||
|
}
|
||||||
|
|
||||||
|
func syncDirectoryTree(srcDir, dstDir string) error {
|
||||||
|
seen := make(map[string]struct{})
|
||||||
|
err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(srcDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel = filepath.Clean(rel)
|
||||||
|
if rel == "." {
|
||||||
|
seen["."] = struct{}{}
|
||||||
|
return os.MkdirAll(dstDir, 0755)
|
||||||
|
}
|
||||||
|
seen[rel] = struct{}{}
|
||||||
|
dstPath := filepath.Join(dstDir, rel)
|
||||||
|
if d.IsDir() {
|
||||||
|
info, err := d.Info()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.MkdirAll(dstPath, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
return copyFileIfChanged(path, dstPath)
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return removeMissingPaths(dstDir, seen)
|
||||||
|
}
|
||||||
|
|
||||||
|
// removeMissingPaths deletes every entry below dstDir whose path relative to
// dstDir is not a key of seen. dstDir itself is never removed. The first
// walk or removal error aborts the operation and is returned.
func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
	return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		rel, err := filepath.Rel(dstDir, path)
		if err != nil {
			return err
		}
		rel = filepath.Clean(rel)
		if rel == "." {
			return nil
		}
		if _, ok := seen[rel]; ok {
			return nil
		}
		if err := os.RemoveAll(path); err != nil {
			return err
		}
		if d.IsDir() {
			// The directory no longer exists; tell WalkDir not to descend
			// into it, otherwise reading it fails and aborts the walk.
			return fs.SkipDir
		}
		return nil
	})
}
|
||||||
|
|
||||||
|
func copyFileIfChanged(src, dst string) error {
|
||||||
|
info, err := os.Stat(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return os.MkdirAll(dst, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
srcData, err := os.ReadFile(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return writeFileAtomic(dst, srcData, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
|
||||||
|
func captureCommandAtomic(dst string, name string, args ...string) error {
|
||||||
|
raw, err := blackboxExecCommand(name, args...).CombinedOutput()
|
||||||
|
if len(raw) == 0 {
|
||||||
|
if err != nil {
|
||||||
|
raw = []byte(err.Error() + "\n")
|
||||||
|
} else {
|
||||||
|
raw = []byte("no output\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return writeFileAtomic(dst, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeJSONAtomic(path string, v any) error {
|
||||||
|
raw, err := json.MarshalIndent(v, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
raw = append(raw, '\n')
|
||||||
|
return writeFileAtomic(path, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if _, err := f.Write(data); err != nil {
|
||||||
|
_ = f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := f.Sync(); err != nil {
|
||||||
|
_ = f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := f.Close(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := os.Rename(tmp, path); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return syncFilesystem(filepath.Dir(path))
|
||||||
|
}
|
||||||
|
|
||||||
|
func syncFilesystem(path string) error {
|
||||||
|
return blackboxExecCommand("sync").Run()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensureWritableBlackboxMountpoint verifies that files can be created under
// mountpoint by creating, closing and deleting a temporary probe file. A
// creation failure is wrapped as "target filesystem is not writable"; close
// and remove failures are returned as-is, with a close failure taking
// precedence.
func ensureWritableBlackboxMountpoint(mountpoint string) error {
	f, createErr := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
	if createErr != nil {
		return fmt.Errorf("target filesystem is not writable: %w", createErr)
	}
	probePath := f.Name()

	// The probe is always removed; its error only matters when Close succeeded.
	closeErr := f.Close()
	removeErr := os.Remove(probePath)
	if closeErr != nil {
		return closeErr
	}
	return removeErr
}
|
||||||
|
|
||||||
|
func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
|
||||||
|
msg := strings.TrimSpace(raw)
|
||||||
|
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||||
|
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||||
|
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
52
audit/internal/app/blackbox_test.go
Normal file
52
audit/internal/app/blackbox_test.go
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
|
||||||
|
current := 2 * time.Second
|
||||||
|
got := adjustFlushPeriod(current, 4*time.Second, false, 0)
|
||||||
|
if got <= current {
|
||||||
|
t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
|
||||||
|
current := 10 * time.Second
|
||||||
|
got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
|
||||||
|
if got >= current {
|
||||||
|
t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
|
||||||
|
}
|
||||||
|
if got < blackboxMinFlushPeriod {
|
||||||
|
t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadBlackboxState(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "blackbox-state.json")
|
||||||
|
want := BlackboxState{
|
||||||
|
Status: "running",
|
||||||
|
BootStartedAtUTC: "2026-04-24T00:00:00Z",
|
||||||
|
BootFolder: "boot-folder",
|
||||||
|
UpdatedAtUTC: "2026-04-24T00:00:01Z",
|
||||||
|
Targets: []BlackboxTargetStatus{{
|
||||||
|
EnrollmentID: "bb-1",
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
Status: "running",
|
||||||
|
FlushPeriod: "1s",
|
||||||
|
}},
|
||||||
|
}
|
||||||
|
if err := writeJSONAtomic(path, want); err != nil {
|
||||||
|
t.Fatalf("writeJSONAtomic: %v", err)
|
||||||
|
}
|
||||||
|
got, err := ReadBlackboxState(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadBlackboxState: %v", err)
|
||||||
|
}
|
||||||
|
if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
|
||||||
|
t.Fatalf("state=%+v", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -15,6 +15,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var supportBundleServices = []string{
|
var supportBundleServices = []string{
|
||||||
|
"bee-blackbox.service",
|
||||||
"bee-audit.service",
|
"bee-audit.service",
|
||||||
"bee-web.service",
|
"bee-web.service",
|
||||||
"bee-network.service",
|
"bee-network.service",
|
||||||
@@ -256,11 +257,6 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
now := time.Now().UTC()
|
now := time.Now().UTC()
|
||||||
date := now.Format("2006-01-02")
|
|
||||||
tod := now.Format("150405")
|
|
||||||
ver := bundleVersion()
|
|
||||||
model := serverModelForBundle()
|
|
||||||
sn := serverSerialForBundle()
|
|
||||||
|
|
||||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||||
@@ -294,7 +290,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
archiveName := SupportBundleBaseName(now) + ".tar.gz"
|
||||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
@@ -302,6 +298,16 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return archivePath, nil
|
return archivePath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SupportBundleBaseName(at time.Time) string {
|
||||||
|
at = at.UTC()
|
||||||
|
date := at.Format("2006-01-02")
|
||||||
|
tod := at.Format("150405")
|
||||||
|
ver := bundleVersion()
|
||||||
|
model := serverModelForBundle()
|
||||||
|
sn := serverSerialForBundle()
|
||||||
|
return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
|
||||||
|
}
|
||||||
|
|
||||||
func LatestSupportBundlePath() (string, error) {
|
func LatestSupportBundlePath() (string, error) {
|
||||||
return latestSupportBundlePath(os.TempDir())
|
return latestSupportBundlePath(os.TempDir())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -105,6 +105,7 @@ var (
|
|||||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||||
benchmarkGeteuid = os.Geteuid
|
benchmarkGeteuid = os.Geteuid
|
||||||
|
benchmarkResetNvidiaGPU = resetNvidiaGPU
|
||||||
benchmarkSleep = time.Sleep
|
benchmarkSleep = time.Sleep
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -249,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
|
||||||
|
}
|
||||||
|
out, err := benchmarkResetNvidiaGPU(gpuIndex)
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||||
|
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
|
||||||
|
)
|
||||||
|
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
|
||||||
|
for _, line := range strings.Split(trimmed, "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line != "" {
|
||||||
|
logFunc(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
rc := 0
|
||||||
|
if err != nil {
|
||||||
|
rc = 1
|
||||||
|
}
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||||
|
fmt.Sprintf("rc: %d", rc),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||||
if len(gpuIndices) == 0 {
|
if len(gpuIndices) == 0 {
|
||||||
return nil
|
return nil
|
||||||
@@ -266,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
|
|||||||
}
|
}
|
||||||
var failed []int
|
var failed []int
|
||||||
for _, idx := range gpuIndices {
|
for _, idx := range gpuIndices {
|
||||||
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
|
||||||
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
|
||||||
failed = append(failed, idx)
|
failed = append(failed, idx)
|
||||||
if logFunc != nil {
|
if logFunc != nil {
|
||||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||||
@@ -4440,8 +4469,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||||
result.Findings = append(result.Findings,
|
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
|
||||||
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
|
||||||
}
|
}
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
singlePowerStopCh := make(chan struct{})
|
singlePowerStopCh := make(chan struct{})
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"os"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -188,18 +188,16 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
oldGeteuid := benchmarkGeteuid
|
oldGeteuid := benchmarkGeteuid
|
||||||
oldExec := satExecCommand
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
benchmarkGeteuid = func() int { return 1000 }
|
benchmarkGeteuid = func() int { return 1000 }
|
||||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||||
t.Fatalf("unexpected command: %s %v", name, args)
|
t.Fatal("unexpected reset call")
|
||||||
return nil
|
return "", nil
|
||||||
}
|
}
|
||||||
t.Cleanup(func() {
|
t.Cleanup(func() {
|
||||||
benchmarkGeteuid = oldGeteuid
|
benchmarkGeteuid = oldGeteuid
|
||||||
satExecCommand = oldExec
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
})
|
})
|
||||||
|
|
||||||
var logs []string
|
var logs []string
|
||||||
@@ -215,44 +213,52 @@ func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
dir := t.TempDir()
|
|
||||||
script := filepath.Join(dir, "nvidia-smi")
|
|
||||||
argsLog := filepath.Join(dir, "args.log")
|
|
||||||
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
|
||||||
t.Fatalf("write script: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
oldGeteuid := benchmarkGeteuid
|
oldGeteuid := benchmarkGeteuid
|
||||||
oldSleep := benchmarkSleep
|
oldSleep := benchmarkSleep
|
||||||
oldLookPath := satLookPath
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
benchmarkGeteuid = func() int { return 0 }
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
benchmarkSleep = func(time.Duration) {}
|
benchmarkSleep = func(time.Duration) {}
|
||||||
satLookPath = func(file string) (string, error) {
|
var calls []int
|
||||||
if file == "nvidia-smi" {
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
return script, nil
|
calls = append(calls, index)
|
||||||
}
|
return "ok\n", nil
|
||||||
return exec.LookPath(file)
|
|
||||||
}
|
}
|
||||||
t.Cleanup(func() {
|
t.Cleanup(func() {
|
||||||
benchmarkGeteuid = oldGeteuid
|
benchmarkGeteuid = oldGeteuid
|
||||||
benchmarkSleep = oldSleep
|
benchmarkSleep = oldSleep
|
||||||
satLookPath = oldLookPath
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
})
|
})
|
||||||
|
|
||||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
if len(failed) != 0 {
|
if len(failed) != 0 {
|
||||||
t.Fatalf("failed=%v want no failures", failed)
|
t.Fatalf("failed=%v want no failures", failed)
|
||||||
}
|
}
|
||||||
raw, err := os.ReadFile(argsLog)
|
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||||
if err != nil {
|
t.Fatalf("calls=%v want %s", calls, want)
|
||||||
t.Fatalf("read args log: %v", err)
|
|
||||||
}
|
}
|
||||||
got := strings.Fields(string(raw))
|
}
|
||||||
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
|
||||||
if strings.Join(got, " ") != strings.Join(want, " ") {
|
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||||
t.Fatalf("args=%v want %v", got, want)
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
|
if index == 5 {
|
||||||
|
return "busy\n", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
return "ok\n", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||||
|
t.Fatalf("failed=%v want %s", failed, want)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ var workerPatterns = []string{
|
|||||||
"stress-ng",
|
"stress-ng",
|
||||||
"stressapptest",
|
"stressapptest",
|
||||||
"memtester",
|
"memtester",
|
||||||
|
"nvbandwidth",
|
||||||
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||||
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||||
"nvvs",
|
"nvvs",
|
||||||
@@ -71,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
|
|||||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||||
base = exe[idx+1:]
|
base = exe[idx+1:]
|
||||||
}
|
}
|
||||||
for _, pat := range workerPatterns {
|
if shouldKillWorkerProcess(exe, base) {
|
||||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return killed
|
return killed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldKillWorkerProcess(exe, base string) bool {
|
||||||
|
for _, pat := range workerPatterns {
|
||||||
|
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|||||||
39
audit/internal/platform/kill_workers_test.go
Normal file
39
audit/internal/platform/kill_workers_test.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestShouldKillWorkerProcess(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
exe string
|
||||||
|
base string
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "nvbandwidth executable",
|
||||||
|
exe: "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
|
||||||
|
base: "nvbandwidth",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "dcgmi executable",
|
||||||
|
exe: "/usr/bin/dcgmi",
|
||||||
|
base: "dcgmi",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "unrelated process",
|
||||||
|
exe: "/usr/bin/bash",
|
||||||
|
base: "bash",
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
|
||||||
|
t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -3,6 +3,8 @@ package platform
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -28,3 +30,22 @@ func runNvidiaRecover(args ...string) (string, error) {
|
|||||||
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resetNvidiaGPU(index int) (string, error) {
|
||||||
|
if index < 0 {
|
||||||
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
|
}
|
||||||
|
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "GPU reset completed.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func restartNvidiaDrivers() (string, error) {
|
||||||
|
out, err := runNvidiaRecover("restart-drivers")
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "NVIDIA drivers restarted.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
|
|||||||
@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||||
if index < 0 {
|
return resetNvidiaGPU(index)
|
||||||
return "", fmt.Errorf("gpu index must be >= 0")
|
|
||||||
}
|
|
||||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
|
||||||
if strings.TrimSpace(out) == "" && err == nil {
|
|
||||||
out = "GPU reset completed.\n"
|
|
||||||
}
|
|
||||||
return out, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ func (s *System) ServiceState(name string) string {
|
|||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
if name == "bee-nvidia" && action == ServiceRestart {
|
if name == "bee-nvidia" && action == ServiceRestart {
|
||||||
return runNvidiaRecover("restart-drivers")
|
return restartNvidiaDrivers()
|
||||||
}
|
}
|
||||||
// bee-web runs as the bee user; sudo is required to control system services.
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
|
|||||||
@@ -806,15 +806,14 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job == nil || !t.job.abort() {
|
||||||
t.job.abort()
|
globalQueue.mu.Unlock()
|
||||||
|
writeJSON(w, map[string]string{"status": "not_running"})
|
||||||
|
return
|
||||||
}
|
}
|
||||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
globalQueue.mu.Unlock()
|
||||||
platform.KillTestWorkers()
|
writeJSON(w, map[string]string{"status": "aborting"})
|
||||||
}
|
return
|
||||||
t.Status = TaskCancelled
|
|
||||||
now := time.Now()
|
|
||||||
t.DoneAt = &now
|
|
||||||
}
|
}
|
||||||
globalQueue.mu.Unlock()
|
globalQueue.mu.Unlock()
|
||||||
writeJSON(w, map[string]string{"status": "aborted"})
|
writeJSON(w, map[string]string{"status": "aborted"})
|
||||||
@@ -1039,6 +1038,81 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
|
|||||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBlackboxStatus(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
state, err := app.ReadBlackboxState(filepath.Join(h.opts.ExportDir, "blackbox-state.json"))
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
writeJSON(w, app.BlackboxState{Status: "disabled", Targets: []app.BlackboxTargetStatus{}})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if state.Targets == nil {
|
||||||
|
state.Targets = []app.BlackboxTargetStatus{}
|
||||||
|
}
|
||||||
|
writeJSON(w, state)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBlackboxEnable(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var target platform.RemovableTarget
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || strings.TrimSpace(target.Device) == "" {
|
||||||
|
writeError(w, http.StatusBadRequest, "device is required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
targets, err := h.opts.App.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
allowed := false
|
||||||
|
for _, candidate := range targets {
|
||||||
|
if candidate.Device == target.Device {
|
||||||
|
target = candidate
|
||||||
|
allowed = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !allowed {
|
||||||
|
writeError(w, http.StatusBadRequest, "device not in removable target list")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
marker, err := app.EnableBlackboxTarget(target)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]any{
|
||||||
|
"status": "ok",
|
||||||
|
"message": "Black-box marker written.",
|
||||||
|
"enrollment_id": marker.EnrollmentID,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBlackboxDisable(w http.ResponseWriter, r *http.Request) {
|
||||||
|
var req struct {
|
||||||
|
Device string `json:"device"`
|
||||||
|
EnrollmentID string `json:"enrollment_id"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := app.DisableBlackboxTarget(req.Device, req.EnrollmentID); err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
writeError(w, http.StatusNotFound, "black-box target not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": "ok", "message": "Black-box marker removed."})
|
||||||
|
}
|
||||||
|
|
||||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package webui
|
|||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
@@ -44,6 +46,45 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||||
|
|
||||||
|
h.handleAPIBlackboxStatus(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var state app.BlackboxState
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &state); err != nil {
|
||||||
|
t.Fatalf("decode state: %v", err)
|
||||||
|
}
|
||||||
|
if state.Status != "disabled" {
|
||||||
|
t.Fatalf("status=%q want disabled", state.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
||||||
|
exportDir := t.TempDir()
|
||||||
|
statePath := filepath.Join(exportDir, "blackbox-state.json")
|
||||||
|
if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
|
||||||
|
t.Fatalf("write state: %v", err)
|
||||||
|
}
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||||
|
|
||||||
|
h.handleAPIBlackboxStatus(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
|
||||||
|
t.Fatalf("body=%s", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
originalTasks := globalQueue.tasks
|
originalTasks := globalQueue.tasks
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ type jobState struct {
|
|||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
logPath string
|
logPath string
|
||||||
serialPrefix string
|
serialPrefix string
|
||||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||||
logBuf *bufio.Writer
|
logBuf *bufio.Writer
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,13 +53,21 @@ func (j *jobState) abort() bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (j *jobState) append(line string) {
|
func (j *jobState) append(line string) {
|
||||||
|
j.appendWithOptions(line, true, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) appendFromLog(line string) {
|
||||||
|
j.appendWithOptions(line, false, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
|
||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
j.lines = append(j.lines, line)
|
j.lines = append(j.lines, line)
|
||||||
if j.logPath != "" {
|
if persistLog && j.logPath != "" {
|
||||||
j.writeLogLineLocked(line)
|
j.writeLogLineLocked(line)
|
||||||
}
|
}
|
||||||
if j.serialPrefix != "" {
|
if serialMirror && j.serialPrefix != "" {
|
||||||
taskSerialWriteLine(j.serialPrefix + line)
|
taskSerialWriteLine(j.serialPrefix + line)
|
||||||
}
|
}
|
||||||
for _, ch := range j.subs {
|
for _, ch := range j.subs {
|
||||||
|
|||||||
@@ -102,47 +102,69 @@ window.supportBundleDownload = function() {
|
|||||||
|
|
||||||
func renderUSBExportCard() string {
|
func renderUSBExportCard() string {
|
||||||
return `<div class="card" style="margin-top:16px">
|
return `<div class="card" style="margin-top:16px">
|
||||||
<div class="card-head">Export to USB
|
<div class="card-head">USB Black-Box
|
||||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
<button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="card-body">` + renderUSBExportInline() + `</div>
|
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||||
</div>`
|
</div>`
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderUSBExportInline() string {
|
func renderUSBExportInline() string {
|
||||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
|
||||||
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||||
|
<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
|
||||||
<div id="usb-targets" style="margin-top:12px"></div>
|
<div id="usb-targets" style="margin-top:12px"></div>
|
||||||
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||||
<script>
|
<script>
|
||||||
(function(){
|
(function(){
|
||||||
function usbRefresh() {
|
function blackboxRefresh() {
|
||||||
document.getElementById('usb-status').textContent = 'Scanning...';
|
document.getElementById('usb-status').textContent = 'Scanning...';
|
||||||
|
document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
|
||||||
document.getElementById('usb-targets').innerHTML = '';
|
document.getElementById('usb-targets').innerHTML = '';
|
||||||
document.getElementById('usb-msg').textContent = '';
|
document.getElementById('usb-msg').textContent = '';
|
||||||
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
|
Promise.all([
|
||||||
window._usbTargets = Array.isArray(targets) ? targets : [];
|
fetch('/api/export/usb').then(r=>r.json()),
|
||||||
|
fetch('/api/blackbox/status').then(r=>r.json())
|
||||||
|
]).then(function(values) {
|
||||||
|
const targets = Array.isArray(values[0]) ? values[0] : [];
|
||||||
|
const state = values[1] || {};
|
||||||
|
const active = Array.isArray(state.targets) ? state.targets : [];
|
||||||
|
window._usbTargets = targets;
|
||||||
|
window._blackboxTargets = active;
|
||||||
const st = document.getElementById('usb-status');
|
const st = document.getElementById('usb-status');
|
||||||
const ct = document.getElementById('usb-targets');
|
const ct = document.getElementById('usb-targets');
|
||||||
|
const summary = document.getElementById('blackbox-summary');
|
||||||
|
if (state.boot_folder) {
|
||||||
|
summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
|
||||||
|
} else {
|
||||||
|
summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
|
||||||
|
}
|
||||||
if (!targets || targets.length === 0) {
|
if (!targets || targets.length === 0) {
|
||||||
st.textContent = 'No removable USB devices found.';
|
st.textContent = 'No removable USB devices found.';
|
||||||
return;
|
} else {
|
||||||
|
st.textContent = targets.length + ' device(s) found:';
|
||||||
}
|
}
|
||||||
st.textContent = targets.length + ' device(s) found:';
|
const byDevice = {};
|
||||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
|
active.forEach(function(item) { byDevice[item.device] = item; });
|
||||||
|
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
|
||||||
targets.map((t, idx) => {
|
targets.map((t, idx) => {
|
||||||
const dev = t.device || '';
|
const dev = t.device || '';
|
||||||
const label = t.label || '';
|
const label = t.label || '';
|
||||||
const model = t.model || '';
|
const model = t.model || '';
|
||||||
|
const state = byDevice[dev];
|
||||||
|
const status = state ? (state.status + (state.flush_period ? ', flush ' + state.flush_period : '')) : 'not enrolled';
|
||||||
|
const detail = state && state.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+state.last_error+'</div>') : '';
|
||||||
return '<tr>' +
|
return '<tr>' +
|
||||||
'<td style="font-family:monospace">'+dev+'</td>' +
|
'<td style="font-family:monospace">'+dev+'</td>' +
|
||||||
'<td>'+t.fs_type+'</td>' +
|
'<td>'+t.fs_type+'</td>' +
|
||||||
'<td>'+t.size+'</td>' +
|
'<td>'+t.size+'</td>' +
|
||||||
'<td>'+label+'</td>' +
|
'<td>'+label+'</td>' +
|
||||||
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||||
|
'<td style="font-size:12px">'+status+detail+'</td>' +
|
||||||
'<td style="white-space:nowrap">' +
|
'<td style="white-space:nowrap">' +
|
||||||
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
|
(state
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
|
? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
|
||||||
|
: '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
|
||||||
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
|
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
|
||||||
'</td></tr>';
|
'</td></tr>';
|
||||||
}).join('') + '</table>';
|
}).join('') + '</table>';
|
||||||
@@ -150,7 +172,7 @@ function usbRefresh() {
|
|||||||
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
window.usbExport = function(type, targetIndex, btn) {
|
window.blackboxEnable = function(targetIndex, btn) {
|
||||||
const target = (window._usbTargets || [])[targetIndex];
|
const target = (window._usbTargets || [])[targetIndex];
|
||||||
if (!target) {
|
if (!target) {
|
||||||
const msg = document.getElementById('usb-msg');
|
const msg = document.getElementById('usb-msg');
|
||||||
@@ -164,15 +186,15 @@ window.usbExport = function(type, targetIndex, btn) {
|
|||||||
const originalText = btn ? btn.textContent : '';
|
const originalText = btn ? btn.textContent : '';
|
||||||
if (btn) {
|
if (btn) {
|
||||||
btn.disabled = true;
|
btn.disabled = true;
|
||||||
btn.textContent = 'Exporting...';
|
btn.textContent = 'Enabling...';
|
||||||
}
|
}
|
||||||
if (rowMsg) {
|
if (rowMsg) {
|
||||||
rowMsg.style.color = 'var(--muted)';
|
rowMsg.style.color = 'var(--muted)';
|
||||||
rowMsg.textContent = 'Working...';
|
rowMsg.textContent = 'Working...';
|
||||||
}
|
}
|
||||||
msg.style.color = 'var(--muted)';
|
msg.style.color = 'var(--muted)';
|
||||||
msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
|
msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
|
||||||
fetch('/api/export/usb/'+type, {
|
fetch('/api/blackbox/enable', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {'Content-Type':'application/json'},
|
headers: {'Content-Type':'application/json'},
|
||||||
body: JSON.stringify(target)
|
body: JSON.stringify(target)
|
||||||
@@ -199,10 +221,64 @@ window.usbExport = function(type, targetIndex, btn) {
|
|||||||
btn.disabled = false;
|
btn.disabled = false;
|
||||||
btn.textContent = originalText;
|
btn.textContent = originalText;
|
||||||
}
|
}
|
||||||
|
setTimeout(blackboxRefresh, 300);
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
window.usbRefresh = usbRefresh;
|
window.blackboxDisable = function(targetIndex, btn) {
|
||||||
usbRefresh();
|
const target = (window._usbTargets || [])[targetIndex];
|
||||||
|
const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
|
||||||
|
if (!target || !active) {
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: black-box target not found. Refresh and try again.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
const row = btn ? btn.closest('td') : null;
|
||||||
|
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
|
||||||
|
const originalText = btn ? btn.textContent : '';
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Disabling...';
|
||||||
|
}
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--muted)';
|
||||||
|
rowMsg.textContent = 'Working...';
|
||||||
|
}
|
||||||
|
msg.style.color = 'var(--muted)';
|
||||||
|
msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
|
||||||
|
fetch('/api/blackbox/disable', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
|
||||||
|
}).then(async r => {
|
||||||
|
const d = await r.json();
|
||||||
|
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
|
||||||
|
return d;
|
||||||
|
}).then(d => {
|
||||||
|
msg.style.color = 'var(--ok,green)';
|
||||||
|
msg.textContent = d.message || 'Done.';
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--ok,green)';
|
||||||
|
rowMsg.textContent = d.message || 'Done.';
|
||||||
|
}
|
||||||
|
}).catch(e => {
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: '+e;
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--err,red)';
|
||||||
|
rowMsg.textContent = 'Error: ' + e;
|
||||||
|
}
|
||||||
|
}).finally(() => {
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = originalText;
|
||||||
|
}
|
||||||
|
setTimeout(blackboxRefresh, 300);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
window.blackboxRefresh = blackboxRefresh;
|
||||||
|
blackboxRefresh();
|
||||||
})();
|
})();
|
||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
@@ -382,7 +458,7 @@ function installToRAM() {
|
|||||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
` + renderSupportBundleInline() + `
|
` + renderSupportBundleInline() + `
|
||||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||||
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
|
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||||
` + renderUSBExportInline() + `
|
` + renderUSBExportInline() + `
|
||||||
</div>
|
</div>
|
||||||
</div></div>
|
</div></div>
|
||||||
|
|||||||
@@ -207,7 +207,7 @@ func renderInstall() string {
|
|||||||
func renderTasks() string {
|
func renderTasks() string {
|
||||||
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
||||||
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
||||||
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
|
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
|
||||||
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
||||||
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
|
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
|
||||||
</div>
|
</div>
|
||||||
@@ -289,7 +289,7 @@ function cancelAll() {
|
|||||||
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
||||||
}
|
}
|
||||||
function killWorkers() {
|
function killWorkers() {
|
||||||
if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
|
if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
|
||||||
fetch('/api/tasks/kill-workers',{method:'POST'})
|
fetch('/api/tasks/kill-workers',{method:'POST'})
|
||||||
.then(r=>r.json())
|
.then(r=>r.json())
|
||||||
.then(d=>{
|
.then(d=>{
|
||||||
|
|||||||
@@ -301,8 +301,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Export
|
// Export
|
||||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
|
||||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
|
||||||
|
mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)
|
||||||
|
|
||||||
// Tools
|
// Tools
|
||||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||||
|
|||||||
@@ -671,11 +671,11 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
|||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `Export to USB`) {
|
if !strings.Contains(body, `USB Black-Box`) {
|
||||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
t.Fatalf("tools page missing usb black-box section: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
if !strings.Contains(body, `/api/blackbox/status`) {
|
||||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
505
audit/internal/webui/task_runner.go
Normal file
505
audit/internal/webui/task_runner.go
Normal file
@@ -0,0 +1,505 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/runtimeenv"
|
||||||
|
)
|
||||||
|
|
||||||
|
// taskRunnerState is the JSON document written to runner-state.json in a
// task's artifacts directory by the process that executes the task.
type taskRunnerState struct {
	// PID is the process ID recorded by the writer.
	PID int `json:"pid"`

	// Status is the task status at the time of writing.
	Status string `json:"status"`

	// Error is the task's error message; it is omitted from JSON when empty.
	Error string `json:"error,omitempty"`

	// UpdatedAt is the timestamp supplied by the writer.
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||||
|
|
||||||
|
func taskRunnerStatePath(t *Task) string {
|
||||||
|
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return filepath.Join(t.ArtifactsDir, "runner-state.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTaskRunnerState(t *Task, state taskRunnerState) error {
|
||||||
|
path := taskRunnerStatePath(t)
|
||||||
|
if path == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(state, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
|
||||||
|
path := taskRunnerStatePath(t)
|
||||||
|
if path == "" {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
var state taskRunnerState
|
||||||
|
if err := json.Unmarshal(data, &state); err != nil {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
return state, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// processAlive reports whether a process with the given pid exists.
//
// Non-positive pids are never considered alive. Otherwise signal 0 is sent to
// the pid: success means the process exists, and EPERM is also treated as
// alive. Any other error yields false.
func processAlive(pid int) bool {
	if pid <= 0 {
		return false
	}
	if err := syscall.Kill(pid, 0); err != nil && err != syscall.EPERM {
		return false
	}
	return true
}
|
||||||
|
|
||||||
|
func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
|
||||||
|
now := time.Now()
|
||||||
|
t.DoneAt = &now
|
||||||
|
switch {
|
||||||
|
case cancelled:
|
||||||
|
t.Status = TaskCancelled
|
||||||
|
t.ErrMsg = "aborted"
|
||||||
|
case strings.TrimSpace(errMsg) != "":
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = errMsg
|
||||||
|
default:
|
||||||
|
t.Status = TaskDone
|
||||||
|
t.ErrMsg = ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
|
||||||
|
if opts == nil {
|
||||||
|
j.append("ERROR: handler options not configured")
|
||||||
|
j.finish("handler options not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
a := opts.App
|
||||||
|
|
||||||
|
recovered := len(j.lines) > 0
|
||||||
|
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||||
|
if recovered {
|
||||||
|
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
archive string
|
||||||
|
err error
|
||||||
|
)
|
||||||
|
|
||||||
|
switch t.Target {
|
||||||
|
case "nvidia":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
diagLevel := 2
|
||||||
|
if t.params.StressMode {
|
||||||
|
diagLevel = 3
|
||||||
|
}
|
||||||
|
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||||
|
result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
|
||||||
|
if e != nil {
|
||||||
|
err = e
|
||||||
|
} else {
|
||||||
|
archive = result.Body
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||||
|
}
|
||||||
|
case "nvidia-targeted-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if dur <= 0 {
|
||||||
|
dur = 300
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-bench-perf":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
SizeMB: t.params.SizeMB,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
RunNCCL: t.params.RunNCCL,
|
||||||
|
ParallelGPUs: t.params.ParallelGPUs,
|
||||||
|
RampStep: t.params.RampStep,
|
||||||
|
RampTotal: t.params.RampTotal,
|
||||||
|
RampRunID: t.params.RampRunID,
|
||||||
|
}, j.append)
|
||||||
|
case "nvidia-bench-power":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
RampStep: t.params.RampStep,
|
||||||
|
RampTotal: t.params.RampTotal,
|
||||||
|
RampRunID: t.params.RampRunID,
|
||||||
|
}, j.append)
|
||||||
|
case "nvidia-bench-autotune":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
SizeMB: t.params.SizeMB,
|
||||||
|
}, t.params.BenchmarkKind, j.append)
|
||||||
|
case "nvidia-compute":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||||
|
if planErr != nil {
|
||||||
|
err = planErr
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||||
|
dur = rampPlan.DurationSec
|
||||||
|
}
|
||||||
|
if rampPlan.StaggerSeconds > 0 {
|
||||||
|
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
|
||||||
|
case "nvidia-targeted-power":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-pulse":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-bandwidth":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-interconnect":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||||
|
if planErr != nil {
|
||||||
|
err = planErr
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||||
|
dur = rampPlan.DurationSec
|
||||||
|
}
|
||||||
|
if rampPlan.StaggerSeconds > 0 {
|
||||||
|
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||||
|
}
|
||||||
|
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||||
|
DurationSec: dur,
|
||||||
|
Loader: t.params.Loader,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
StaggerSeconds: rampPlan.StaggerSeconds,
|
||||||
|
}, j.append)
|
||||||
|
case "memory":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
|
||||||
|
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
|
||||||
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||||
|
case "storage":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||||
|
case "cpu":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
if dur <= 0 {
|
||||||
|
if t.params.StressMode {
|
||||||
|
dur = 1800
|
||||||
|
} else {
|
||||||
|
dur = 60
|
||||||
|
}
|
||||||
|
}
|
||||||
|
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||||
|
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "amd":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-mem":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-bandwidth":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "memory-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "sat-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "platform-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||||
|
runOpts.Components = t.params.PlatformComponents
|
||||||
|
archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
|
||||||
|
case "audit":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
result, e := a.RunAuditNow(opts.RuntimeMode)
|
||||||
|
if e != nil {
|
||||||
|
err = e
|
||||||
|
} else {
|
||||||
|
for _, line := range splitLines(result.Body) {
|
||||||
|
j.append(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case "support-bundle":
|
||||||
|
j.append("Building support bundle...")
|
||||||
|
archive, err = buildSupportBundle(opts.ExportDir)
|
||||||
|
case "install":
|
||||||
|
if strings.TrimSpace(t.params.Device) == "" {
|
||||||
|
err = fmt.Errorf("device is required")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
installLogPath := platform.InstallLogPath(t.params.Device)
|
||||||
|
j.append("Install log: " + installLogPath)
|
||||||
|
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
|
||||||
|
case "install-to-ram":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
err = a.RunInstallToRAM(ctx, j.append)
|
||||||
|
default:
|
||||||
|
j.append("ERROR: unknown target: " + t.Target)
|
||||||
|
j.finish("unknown target")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if archive != "" {
|
||||||
|
archivePath := app.ExtractArchivePath(archive)
|
||||||
|
if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||||
|
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||||
|
}
|
||||||
|
if opts.App != nil && opts.App.StatusDB != nil {
|
||||||
|
app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
j.append("Aborted.")
|
||||||
|
j.finish("aborted")
|
||||||
|
} else {
|
||||||
|
j.append("ERROR: " + err.Error())
|
||||||
|
j.finish(err.Error())
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if archive != "" {
|
||||||
|
j.append("Archive: " + archive)
|
||||||
|
}
|
||||||
|
j.finish("")
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadPersistedTask(statePath, taskID string) (*Task, error) {
|
||||||
|
data, err := os.ReadFile(statePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var persisted []persistedTask
|
||||||
|
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
for _, pt := range persisted {
|
||||||
|
if pt.ID != taskID {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
t := &Task{
|
||||||
|
ID: pt.ID,
|
||||||
|
Name: pt.Name,
|
||||||
|
Target: pt.Target,
|
||||||
|
Priority: pt.Priority,
|
||||||
|
Status: pt.Status,
|
||||||
|
CreatedAt: pt.CreatedAt,
|
||||||
|
StartedAt: pt.StartedAt,
|
||||||
|
DoneAt: pt.DoneAt,
|
||||||
|
ErrMsg: pt.ErrMsg,
|
||||||
|
LogPath: pt.LogPath,
|
||||||
|
ArtifactsDir: pt.ArtifactsDir,
|
||||||
|
ReportJSONPath: pt.ReportJSONPath,
|
||||||
|
ReportHTMLPath: pt.ReportHTMLPath,
|
||||||
|
params: pt.Params,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(t)
|
||||||
|
return t, nil
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("task %s not found", taskID)
|
||||||
|
}
|
||||||
|
|
||||||
|
func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
|
||||||
|
if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
|
||||||
|
fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("resolve runtime for task-run", "err", err)
|
||||||
|
}
|
||||||
|
opts := &HandlerOptions{
|
||||||
|
ExportDir: exportDir,
|
||||||
|
App: app.New(platform.New()),
|
||||||
|
RuntimeMode: runtimeInfo.Mode,
|
||||||
|
}
|
||||||
|
statePath := filepath.Join(exportDir, "tasks-state.json")
|
||||||
|
task, err := loadPersistedTask(statePath, taskID)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintln(stderr, err.Error())
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
if task.StartedAt == nil || task.StartedAt.IsZero() {
|
||||||
|
now := time.Now()
|
||||||
|
task.StartedAt = &now
|
||||||
|
}
|
||||||
|
if task.Status == "" {
|
||||||
|
task.Status = TaskRunning
|
||||||
|
}
|
||||||
|
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||||
|
PID: os.Getpid(),
|
||||||
|
Status: TaskRunning,
|
||||||
|
UpdatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
fmt.Fprintln(stderr, err.Error())
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||||
|
executeTaskWithOptions(opts, task, j, ctx)
|
||||||
|
finalizeTaskForResult(task, j.err, ctx.Err() != nil)
|
||||||
|
if err := writeTaskReportArtifacts(task); err != nil {
|
||||||
|
appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||||
|
}
|
||||||
|
j.closeLog()
|
||||||
|
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||||
|
PID: os.Getpid(),
|
||||||
|
Status: task.Status,
|
||||||
|
Error: task.ErrMsg,
|
||||||
|
UpdatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
fmt.Fprintln(stderr, err.Error())
|
||||||
|
}
|
||||||
|
if task.ErrMsg != "" {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
@@ -13,6 +14,7 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -110,8 +112,9 @@ type Task struct {
|
|||||||
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
||||||
|
|
||||||
// runtime fields (not serialised)
|
// runtime fields (not serialised)
|
||||||
job *jobState
|
job *jobState
|
||||||
params taskParams
|
runnerPID int
|
||||||
|
params taskParams
|
||||||
}
|
}
|
||||||
|
|
||||||
// taskParams holds optional parameters parsed from the run request.
|
// taskParams holds optional parameters parsed from the run request.
|
||||||
@@ -328,6 +331,13 @@ var (
|
|||||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||||
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
||||||
}
|
}
|
||||||
|
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||||
|
exe, err := os.Executable()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return exec.Command(exe, "bee-worker", "--export-dir", exportDir, "--task-id", taskID), nil
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// enqueue adds a task to the queue and notifies the worker.
|
// enqueue adds a task to the queue and notifies the worker.
|
||||||
@@ -365,6 +375,11 @@ func (q *taskQueue) prune() {
|
|||||||
|
|
||||||
// nextPending returns the highest-priority pending task (nil if none).
|
// nextPending returns the highest-priority pending task (nil if none).
|
||||||
func (q *taskQueue) nextPending() *Task {
|
func (q *taskQueue) nextPending() *Task {
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
if t.Status == TaskRunning {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
var best *Task
|
var best *Task
|
||||||
for _, t := range q.tasks {
|
for _, t := range q.tasks {
|
||||||
if t.Status != TaskPending {
|
if t.Status != TaskPending {
|
||||||
@@ -484,6 +499,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
|||||||
if !q.started {
|
if !q.started {
|
||||||
q.loadLocked()
|
q.loadLocked()
|
||||||
q.started = true
|
q.started = true
|
||||||
|
q.resumeRunningTasksLocked()
|
||||||
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
||||||
}
|
}
|
||||||
hasPending := q.nextPending() != nil
|
hasPending := q.nextPending() != nil
|
||||||
@@ -517,15 +533,12 @@ func (q *taskQueue) worker() {
|
|||||||
t.StartedAt = &now
|
t.StartedAt = &now
|
||||||
t.DoneAt = nil
|
t.DoneAt = nil
|
||||||
t.ErrMsg = ""
|
t.ErrMsg = ""
|
||||||
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
j := newTaskJobState(t.LogPath)
|
||||||
t.job = j
|
t.job = j
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
|
|
||||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
q.runTaskExternal(t, j)
|
||||||
j.cancel = taskCancel
|
|
||||||
q.executeTask(t, j, taskCtx)
|
|
||||||
taskCancel()
|
|
||||||
|
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
q.prune()
|
q.prune()
|
||||||
@@ -537,6 +550,207 @@ func (q *taskQueue) worker() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) resumeRunningTasksLocked() {
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
if t.Status != TaskRunning {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if t.job == nil {
|
||||||
|
t.job = newTaskJobState(t.LogPath)
|
||||||
|
}
|
||||||
|
q.attachExternalTaskControlsLocked(t, t.job)
|
||||||
|
q.startRecoveredTaskMonitorLocked(t, t.job)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) attachExternalTaskControlsLocked(t *Task, j *jobState) {
|
||||||
|
if t == nil || j == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
j.cancel = func() {
|
||||||
|
pid := t.runnerPID
|
||||||
|
if pid <= 0 {
|
||||||
|
if state, ok := readTaskRunnerState(t); ok {
|
||||||
|
pid = state.PID
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if pid > 0 {
|
||||||
|
_ = syscall.Kill(pid, syscall.SIGTERM)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
|
||||||
|
if t == nil || j == nil || t.runnerPID <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
goRecoverOnce("task runner monitor", func() {
|
||||||
|
stopTail := make(chan struct{})
|
||||||
|
doneTail := make(chan struct{})
|
||||||
|
go q.followTaskLog(t, j, stopTail, doneTail)
|
||||||
|
for processAlive(t.runnerPID) {
|
||||||
|
time.Sleep(500 * time.Millisecond)
|
||||||
|
}
|
||||||
|
close(stopTail)
|
||||||
|
<-doneTail
|
||||||
|
q.finishExternalTask(t, j, nil)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
|
||||||
|
stopTail := make(chan struct{})
|
||||||
|
doneTail := make(chan struct{})
|
||||||
|
defer func() {
|
||||||
|
close(stopTail)
|
||||||
|
<-doneTail
|
||||||
|
}()
|
||||||
|
go q.followTaskLog(t, j, stopTail, doneTail)
|
||||||
|
|
||||||
|
cmd, err := externalTaskRunnerCommand(q.opts.ExportDir, t.ID)
|
||||||
|
if err != nil {
|
||||||
|
j.appendFromLog("ERROR: " + err.Error())
|
||||||
|
q.finishExternalTask(t, j, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
j.appendFromLog("ERROR: " + err.Error())
|
||||||
|
q.finishExternalTask(t, j, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
q.mu.Lock()
|
||||||
|
t.runnerPID = cmd.Process.Pid
|
||||||
|
q.attachExternalTaskControlsLocked(t, j)
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
|
||||||
|
waitErr := cmd.Wait()
|
||||||
|
time.Sleep(200 * time.Millisecond)
|
||||||
|
q.finishExternalTask(t, j, waitErr)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) followTaskLog(t *Task, j *jobState, stop <-chan struct{}, done chan<- struct{}) {
|
||||||
|
defer close(done)
|
||||||
|
path := ""
|
||||||
|
if t != nil {
|
||||||
|
path = t.LogPath
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
offset := int64(0)
|
||||||
|
if info, err := os.Stat(path); err == nil {
|
||||||
|
offset = info.Size()
|
||||||
|
}
|
||||||
|
var partial string
|
||||||
|
ticker := time.NewTicker(250 * time.Millisecond)
|
||||||
|
defer ticker.Stop()
|
||||||
|
flush := func() {
|
||||||
|
data, newOffset, err := readTaskLogDelta(path, offset)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
offset = newOffset
|
||||||
|
return
|
||||||
|
}
|
||||||
|
offset = newOffset
|
||||||
|
text := partial + strings.ReplaceAll(string(data), "\r\n", "\n")
|
||||||
|
lines := strings.Split(text, "\n")
|
||||||
|
partial = lines[len(lines)-1]
|
||||||
|
for _, line := range lines[:len(lines)-1] {
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
j.appendFromLog(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
flush()
|
||||||
|
case <-stop:
|
||||||
|
flush()
|
||||||
|
if strings.TrimSpace(partial) != "" {
|
||||||
|
j.appendFromLog(partial)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// readTaskLogDelta reads the bytes of the file at path that follow offset.
//
// It returns the bytes read (at most 1 MiB per call), the offset at which the
// next call should continue, and any error. When the file is shorter than
// offset (for example after truncation) reading restarts from the beginning.
// When the file cannot be opened or stat'ed the incoming offset is returned
// unchanged together with the error.
func readTaskLogDelta(path string, offset int64) ([]byte, int64, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, offset, openErr
	}
	defer file.Close()

	stat, statErr := file.Stat()
	if statErr != nil {
		return nil, offset, statErr
	}

	start := offset
	if stat.Size() < start {
		// The file shrank underneath us; start over from the top.
		start = 0
	}
	if _, seekErr := file.Seek(start, io.SeekStart); seekErr != nil {
		return nil, start, seekErr
	}

	const maxChunk = 1 << 20
	chunk, readErr := io.ReadAll(io.LimitReader(file, maxChunk))
	return chunk, start + int64(len(chunk)), readErr
}
|
||||||
|
|
||||||
|
func (q *taskQueue) finishExternalTask(t *Task, j *jobState, waitErr error) {
|
||||||
|
q.mu.Lock()
|
||||||
|
defer q.mu.Unlock()
|
||||||
|
if t.Status == TaskDone || t.Status == TaskFailed || t.Status == TaskCancelled {
|
||||||
|
if j != nil && !j.isDone() {
|
||||||
|
j.finish(t.ErrMsg)
|
||||||
|
j.closeLog()
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case q.trigger <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
state, ok := readTaskRunnerState(t)
|
||||||
|
switch {
|
||||||
|
case ok && state.Status != TaskRunning:
|
||||||
|
t.Status = state.Status
|
||||||
|
t.ErrMsg = state.Error
|
||||||
|
now := state.UpdatedAt
|
||||||
|
if now.IsZero() {
|
||||||
|
now = time.Now()
|
||||||
|
}
|
||||||
|
t.DoneAt = &now
|
||||||
|
case waitErr != nil:
|
||||||
|
now := time.Now()
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = waitErr.Error()
|
||||||
|
t.DoneAt = &now
|
||||||
|
default:
|
||||||
|
now := time.Now()
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = "task runner exited without final state"
|
||||||
|
t.DoneAt = &now
|
||||||
|
}
|
||||||
|
t.runnerPID = 0
|
||||||
|
q.finalizeTaskArtifactPathsLocked(t)
|
||||||
|
q.persistLocked()
|
||||||
|
|
||||||
|
if j != nil && !j.isDone() {
|
||||||
|
j.finish(t.ErrMsg)
|
||||||
|
j.closeLog()
|
||||||
|
}
|
||||||
|
if t.ErrMsg != "" {
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||||
|
} else {
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case q.trigger <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
|
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
|
||||||
startedKmsgWatch := false
|
startedKmsgWatch := false
|
||||||
defer q.finalizeTaskRun(t, j)
|
defer q.finalizeTaskRun(t, j)
|
||||||
@@ -985,15 +1199,11 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
|||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job == nil || !t.job.abort() {
|
||||||
t.job.abort()
|
writeError(w, http.StatusConflict, "task is not cancellable")
|
||||||
|
return
|
||||||
}
|
}
|
||||||
t.Status = TaskCancelled
|
writeJSON(w, map[string]string{"status": "aborting"})
|
||||||
now := time.Now()
|
|
||||||
t.DoneAt = &now
|
|
||||||
globalQueue.persistLocked()
|
|
||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
|
||||||
default:
|
default:
|
||||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||||
}
|
}
|
||||||
@@ -1039,12 +1249,6 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
|||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
t.job.abort()
|
t.job.abort()
|
||||||
}
|
}
|
||||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
|
||||||
platform.KillTestWorkers()
|
|
||||||
}
|
|
||||||
t.Status = TaskCancelled
|
|
||||||
t.DoneAt = &now
|
|
||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
|
||||||
n++
|
n++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1175,18 +1379,29 @@ func (q *taskQueue) loadLocked() {
|
|||||||
}
|
}
|
||||||
q.assignTaskLogPathLocked(t)
|
q.assignTaskLogPathLocked(t)
|
||||||
if t.Status == TaskRunning {
|
if t.Status == TaskRunning {
|
||||||
// The task was interrupted by a bee-web restart. Child processes
|
state, ok := readTaskRunnerState(t)
|
||||||
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
|
switch {
|
||||||
// their own process groups. Kill any matching stale workers before
|
case ok && state.Status == TaskRunning && processAlive(state.PID):
|
||||||
// marking the task failed so the next GPU test does not inherit a
|
t.runnerPID = state.PID
|
||||||
// busy DCGM slot or duplicate workers.
|
t.job = newTaskJobState(t.LogPath)
|
||||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
case ok && state.Status != TaskRunning:
|
||||||
_ = platform.KillTestWorkers()
|
t.runnerPID = state.PID
|
||||||
|
t.Status = state.Status
|
||||||
|
t.ErrMsg = state.Error
|
||||||
|
now := state.UpdatedAt
|
||||||
|
if now.IsZero() {
|
||||||
|
now = time.Now()
|
||||||
|
}
|
||||||
|
t.DoneAt = &now
|
||||||
|
default:
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
_ = platform.KillTestWorkers()
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.DoneAt = &now
|
||||||
|
t.ErrMsg = "interrupted by bee-web restart"
|
||||||
}
|
}
|
||||||
now := time.Now()
|
|
||||||
t.Status = TaskFailed
|
|
||||||
t.DoneAt = &now
|
|
||||||
t.ErrMsg = "interrupted by bee-web restart"
|
|
||||||
} else if t.Status == TaskPending {
|
} else if t.Status == TaskPending {
|
||||||
t.StartedAt = nil
|
t.StartedAt = nil
|
||||||
t.DoneAt = nil
|
t.DoneAt = nil
|
||||||
|
|||||||
2
bible
2
bible
Submodule bible updated: 98448c993f...1d89a4918e
@@ -23,9 +23,9 @@ insmod serial
|
|||||||
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
||||||
|
|
||||||
insmod gfxterm
|
insmod gfxterm
|
||||||
insmod png
|
|
||||||
|
|
||||||
source /boot/grub/theme.cfg
|
|
||||||
|
|
||||||
terminal_input console serial
|
terminal_input console serial
|
||||||
terminal_output gfxterm serial
|
terminal_output gfxterm serial
|
||||||
|
|
||||||
|
insmod png
|
||||||
|
source /boot/grub/theme.cfg
|
||||||
|
|||||||
@@ -1,15 +1,5 @@
|
|||||||
source /boot/grub/config.cfg
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
|
||||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
|
||||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
|
||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
|
||||||
echo " Hardware Audit LiveCD"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 70 KiB After Width: | Height: | Size: 78 KiB |
@@ -34,11 +34,11 @@ terminal-font: "Unifont Regular 16"
|
|||||||
item_font = "Unifont Regular 16"
|
item_font = "Unifont Regular 16"
|
||||||
selected_item_color= "#f5a800"
|
selected_item_color= "#f5a800"
|
||||||
selected_item_font = "Unifont Regular 16"
|
selected_item_font = "Unifont Regular 16"
|
||||||
item_height = 16
|
item_height = 20
|
||||||
item_padding = 0
|
item_padding = 2
|
||||||
item_spacing = 4
|
item_spacing = 4
|
||||||
icon_width = 0
|
icon_width = 0
|
||||||
icon_heigh = 0
|
icon_height = 0
|
||||||
item_icon_space = 0
|
item_icon_space = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit
|
Description=Bee: hardware audit
|
||||||
After=bee-preflight.service bee-network.service bee-nvidia.service
|
After=bee-preflight.service bee-network.service bee-nvidia.service bee-blackbox.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
18
iso/overlay/etc/systemd/system/bee-blackbox.service
Normal file
18
iso/overlay/etc/systemd/system/bee-blackbox.service
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Bee: USB black-box log mirror
|
||||||
|
After=local-fs.target
|
||||||
|
Before=bee-network.service bee-nvidia.service bee-preflight.service bee-audit.service bee-web.service
|
||||||
|
StartLimitIntervalSec=0
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-blackbox.log /usr/local/bin/bee blackbox --export-dir /appdata/bee/export --state-file /appdata/bee/export/blackbox-state.json
|
||||||
|
Restart=always
|
||||||
|
RestartSec=1
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
OOMScoreAdjust=-900
|
||||||
|
Nice=0
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: bring up network interfaces via DHCP
|
Description=Bee: bring up network interfaces via DHCP
|
||||||
After=local-fs.target
|
After=local-fs.target bee-blackbox.service
|
||||||
Before=network-online.target bee-audit.service
|
Before=network-online.target bee-audit.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: load NVIDIA kernel modules and create device nodes
|
Description=Bee: load NVIDIA kernel modules and create device nodes
|
||||||
After=local-fs.target udev.service
|
After=local-fs.target udev.service bee-blackbox.service
|
||||||
Before=bee-audit.service
|
Before=bee-audit.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: runtime preflight self-check
|
Description=Bee: runtime preflight self-check
|
||||||
After=bee-network.service bee-nvidia.service
|
After=bee-network.service bee-nvidia.service bee-blackbox.service
|
||||||
Before=bee-audit.service
|
Before=bee-audit.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit web viewer
|
Description=Bee: hardware audit web viewer
|
||||||
|
After=bee-blackbox.service
|
||||||
StartLimitIntervalSec=0
|
StartLimitIntervalSec=0
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
@@ -60,35 +60,129 @@ wait_for_process_exit() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
kill_pattern() {
|
log_pid_details() {
|
||||||
pattern="$1"
|
pid="$1"
|
||||||
if pgrep -f "$pattern" >/dev/null 2>&1; then
|
line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
|
||||||
pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
|
if [ -n "$line" ]; then
|
||||||
|
log_blocker "$line"
|
||||||
|
else
|
||||||
|
log_blocker "pid $pid"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# collect_gpu_compute_pids INDEX
# Print, one per line, the PIDs that nvidia-smi reports as compute apps on GPU
# INDEX. Prints nothing and succeeds when nvidia-smi is not installed or the
# query yields no numeric PIDs.
collect_gpu_compute_pids() {
    index="$1"
    command -v nvidia-smi >/dev/null 2>&1 || return 0
    # Trim surrounding whitespace and keep purely numeric rows only.
    nvidia-smi --id="$index" --query-compute-apps=pid --format=csv,noheader,nounits 2>/dev/null \
        | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
        | grep -E '^[0-9]+$' || true
}
|
||||||
|
|
||||||
|
# collect_gpu_device_pids INDEX
# Print, one per line, the PIDs that fuser reports for /dev/nvidiaINDEX.
# Prints nothing and succeeds when the device node does not exist or fuser is
# not installed.
collect_gpu_device_pids() {
    index="$1"
    dev="/dev/nvidia$index"
    if [ ! -e "$dev" ]; then
        return 0
    fi
    command -v fuser >/dev/null 2>&1 || return 0
    # fuser prints space-separated PIDs; split them onto separate lines, drop
    # anything after the leading digits, and keep numeric rows only.
    fuser "$dev" 2>/dev/null \
        | tr ' ' '\n' \
        | sed 's/[^0-9].*$//' \
        | grep -E '^[0-9]+$' || true
}
|
||||||
|
|
||||||
|
# collect_gpu_holder_pids INDEX
# Print the sorted, de-duplicated union of the compute-app PIDs and the
# device-node holder PIDs for GPU INDEX, one per line.
collect_gpu_holder_pids() {
    index="$1"
    { collect_gpu_compute_pids "$index"; collect_gpu_device_pids "$index"; } \
        | awk 'NF' \
        | sort -u
}
|
||||||
|
|
||||||
|
# kill_pid_list PIDS
# Terminate every PID in the whitespace-separated list PIDS: log each process,
# send SIGTERM to all of them, wait one second, then send SIGKILL to any that
# still exist. An empty list is a no-op. Signal failures are ignored.
kill_pid_list() {
    pids="$1"
    if [ -z "$pids" ]; then
        return 0
    fi

    for pid in $pids; do
        log_pid_details "$pid"
    done
    log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"

    for pid in $pids; do
        kill -TERM "$pid" >/dev/null 2>&1 || true
    done

    sleep 1

    for pid in $pids; do
        # kill -0 only probes for existence; skip PIDs that already exited.
        kill -0 "$pid" >/dev/null 2>&1 || continue
        log "forcing GPU holder PID $pid to exit"
        kill -KILL "$pid" >/dev/null 2>&1 || true
    done
}
|
||||||
|
|
||||||
|
# gpu_has_display_holders INDEX
# Succeed (return 0) when at least one process holding /dev/nvidiaINDEX has a
# command name of Xorg, Xwayland, X or gnome-shell; return 1 otherwise,
# including when the device has no holders at all.
gpu_has_display_holders() {
    index="$1"
    holders=$(collect_gpu_device_pids "$index")
    if [ -z "$holders" ]; then
        return 1
    fi
    for pid in $holders; do
        # Command name of the holder, with surrounding whitespace trimmed.
        comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
        case "$comm" in
            Xorg|Xwayland|X|gnome-shell) return 0 ;;
        esac
    done
    return 1
}
|
||||||
|
|
||||||
|
stop_nv_hostengine_if_running() {
|
||||||
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||||
[ -n "$line" ] || continue
|
[ -n "$line" ] || continue
|
||||||
log_blocker "$line"
|
log_blocker "$line"
|
||||||
done
|
done
|
||||||
log "killing processes matching: $pattern"
|
log "stopping nv-hostengine"
|
||||||
pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
|
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
sleep 1
|
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
|
hostengine_was_active=1
|
||||||
|
return 0
|
||||||
fi
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# stop_fabricmanager_if_active
# Stop nvidia-fabricmanager.service when the unit exists and stop_unit_if_active
# reports success. On success, log the unit as a blocker, set
# fabric_was_active=1 and return 0; otherwise return 1.
stop_fabricmanager_if_active() {
    unit_exists nvidia-fabricmanager.service || return 1
    stop_unit_if_active nvidia-fabricmanager.service || return 1
    log_blocker "service nvidia-fabricmanager.service"
    fabric_was_active=1
    return 0
}
|
||||||
|
|
||||||
|
# stop_display_stack_if_active
# Try to stop display-manager.service and lightdm.service. For each unit that
# exists and that stop_unit_if_active reports as stopped, log it as a blocker
# and set display_was_active=1. Returns 0 when at least one unit was stopped,
# 1 when none was.
stop_display_stack_if_active() {
    stopped=1
    for unit in display-manager.service lightdm.service; do
        unit_exists "$unit" || continue
        stop_unit_if_active "$unit" || continue
        log_blocker "service $unit"
        display_was_active=1
        stopped=0
    done
    return "$stopped"
}
|
||||||
|
|
||||||
|
try_gpu_reset() {
|
||||||
|
index="$1"
|
||||||
|
log "resetting GPU $index"
|
||||||
|
nvidia-smi -r -i "$index"
|
||||||
}
|
}
|
||||||
|
|
||||||
drain_gpu_clients() {
|
drain_gpu_clients() {
|
||||||
display_was_active=0
|
display_was_active=0
|
||||||
fabric_was_active=0
|
fabric_was_active=0
|
||||||
|
hostengine_was_active=0
|
||||||
for unit in display-manager.service lightdm.service; do
|
|
||||||
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
|
||||||
log_blocker "service $unit"
|
|
||||||
display_was_active=1
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
|
||||||
log_blocker "service nvidia-fabricmanager.service"
|
|
||||||
fabric_was_active=1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||||
@@ -98,21 +192,25 @@ drain_gpu_clients() {
|
|||||||
log "stopping nv-hostengine"
|
log "stopping nv-hostengine"
|
||||||
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
hostengine_was_active=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
for pattern in \
|
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||||
"nvidia-smi" \
|
log_blocker "service nvidia-fabricmanager.service"
|
||||||
"dcgmi" \
|
fabric_was_active=1
|
||||||
"nvvs" \
|
fi
|
||||||
"dcgmproftester" \
|
|
||||||
"all_reduce_perf" \
|
for unit in display-manager.service lightdm.service; do
|
||||||
"nvtop" \
|
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||||
"bee-gpu-burn" \
|
log_blocker "service $unit"
|
||||||
"bee-john-gpu-stress" \
|
display_was_active=1
|
||||||
"bee-nccl-gpu-stress" \
|
fi
|
||||||
"Xorg" \
|
done
|
||||||
"Xwayland"; do
|
|
||||||
kill_pattern "$pattern"
|
for dev in /dev/nvidia[0-9]*; do
|
||||||
|
[ -e "$dev" ] || continue
|
||||||
|
holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
|
||||||
|
kill_pid_list "$holders"
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -125,7 +223,7 @@ restore_gpu_clients() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
log "starting nv-hostengine"
|
log "starting nv-hostengine"
|
||||||
nv-hostengine
|
nv-hostengine
|
||||||
fi
|
fi
|
||||||
@@ -153,10 +251,60 @@ restart_drivers() {
|
|||||||
|
|
||||||
reset_gpu() {
|
reset_gpu() {
|
||||||
index="$1"
|
index="$1"
|
||||||
drain_gpu_clients
|
display_was_active=0
|
||||||
log "resetting GPU $index"
|
fabric_was_active=0
|
||||||
nvidia-smi -r -i "$index"
|
hostengine_was_active=0
|
||||||
|
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
stop_nv_hostengine_if_running || true
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
stop_fabricmanager_if_active || true
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if gpu_has_display_holders "$index"; then
|
||||||
|
stop_display_stack_if_active || true
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
log "GPU $index still has holders after targeted drain"
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
try_gpu_reset "$index"
|
||||||
|
rc=$?
|
||||||
restore_gpu_clients
|
restore_gpu_clients
|
||||||
|
return "$rc"
|
||||||
}
|
}
|
||||||
|
|
||||||
cmd="${1:-}"
|
cmd="${1:-}"
|
||||||
|
|||||||
@@ -47,6 +47,13 @@ echo "==> Сборка бинарника..."
|
|||||||
)
|
)
|
||||||
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
|
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
|
||||||
|
|
||||||
|
# Skip the deploy entirely when the remote host already has this exact binary.
LOCAL_SHA="$(shasum -a 256 "${LOCAL_BIN}" | awk '{print $1}')"
# The remote awk program must receive a literal $1. Inside this double-quoted
# string a single backslash is required: "\\$1" would yield a backslash
# followed by the LOCAL script's first argument, breaking the remote awk call
# and leaving REMOTE_SHA empty every time.
# REMOTE_SHA stays empty (and the deploy proceeds) when the remote file or
# sha256sum is missing, or when the ssh call fails.
REMOTE_SHA="$("${SSH_CMD[@]}" "$REMOTE" "if [ -f '${REMOTE_BIN}' ] && command -v sha256sum >/dev/null 2>&1; then sha256sum '${REMOTE_BIN}' | awk '{print \$1}'; fi" 2>/dev/null || true)"
if [[ -n "${REMOTE_SHA}" && "${LOCAL_SHA}" == "${REMOTE_SHA}" ]]; then
    echo "==> Бинарник не изменился (${LOCAL_SHA}); копирование и перезапуск сервисов пропущены."
    exit 0
fi
|
||||||
|
|
||||||
# --- Deploy ---
|
# --- Deploy ---
|
||||||
echo "==> Копирование на ${REMOTE}..."
|
echo "==> Копирование на ${REMOTE}..."
|
||||||
"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"
|
"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"
|
||||||
|
|||||||
Reference in New Issue
Block a user