Files
bee/audit/internal/webui/task_runner.go
Mikhail Chusavitin 8575cf06f8 webui: show all RAID drives per controller and add drive-prepare action
RAID Controller Management previously hid any LSI drive that wasn't
already Frgn/UGood/JBOD, and scoped VROC "free drives" from all system
disks instead of the ones actually wired to the VROC controller's
ports - drives attached directly to the CPU or another HBA could leak
in. Now every drive is listed per its own controller, and LSI drives
not already ready for array creation get a "Prepare" button that
forces them to Unconfigured Good via storcli.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
2026-07-01 13:32:03 +03:00

552 lines
15 KiB
Go

package webui
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"time"
"bee/audit/internal/app"
"bee/audit/internal/platform"
"bee/audit/internal/runtimeenv"
)
type taskRunnerState struct {
PID int `json:"pid"`
Status string `json:"status"`
Error string `json:"error,omitempty"`
UpdatedAt time.Time `json:"updated_at"`
}
func taskRunnerStatePath(t *Task) string {
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
return ""
}
return filepath.Join(t.ArtifactsDir, "runner-state.json")
}
func writeTaskRunnerState(t *Task, state taskRunnerState) error {
path := taskRunnerStatePath(t)
if path == "" {
return nil
}
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
data, err := json.MarshalIndent(state, "", " ")
if err != nil {
return err
}
tmp := path + ".tmp"
if err := os.WriteFile(tmp, data, 0644); err != nil {
return err
}
return os.Rename(tmp, path)
}
func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
path := taskRunnerStatePath(t)
if path == "" {
return taskRunnerState{}, false
}
data, err := os.ReadFile(path)
if err != nil || len(data) == 0 {
return taskRunnerState{}, false
}
var state taskRunnerState
if err := json.Unmarshal(data, &state); err != nil {
return taskRunnerState{}, false
}
return state, true
}
func processAlive(pid int) bool {
if pid <= 0 {
return false
}
err := syscall.Kill(pid, 0)
return err == nil || err == syscall.EPERM
}
func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
now := time.Now()
t.DoneAt = &now
switch {
case cancelled:
t.Status = TaskCancelled
t.ErrMsg = "aborted"
case strings.TrimSpace(errMsg) != "":
t.Status = TaskFailed
t.ErrMsg = errMsg
default:
t.Status = TaskDone
t.ErrMsg = ""
}
}
func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
if opts == nil {
j.append("ERROR: handler options not configured")
j.finish("handler options not configured")
return
}
a := opts.App
recovered := len(j.lines) > 0
j.append(fmt.Sprintf("Starting %s...", t.Name))
if recovered {
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
}
var (
archive string
err error
)
switch t.Target {
case "nvidia":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
diagLevel := 2
if t.params.StressMode {
diagLevel = 3
}
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
if e != nil {
err = e
} else {
archive = result.Body
}
} else {
archive, err = a.RunNvidiaAcceptancePack("", j.append)
}
case "nvidia-targeted-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
dur = 300
}
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-bench-perf":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RunNCCL: t.params.RunNCCL,
ParallelGPUs: t.params.ParallelGPUs,
RampStep: t.params.RampStep,
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-bench-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RampStep: t.params.RampStep,
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-bench-autotune":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
}, t.params.BenchmarkKind, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
if planErr != nil {
err = planErr
break
}
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
dur = rampPlan.DurationSec
}
if rampPlan.StaggerSeconds > 0 {
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
case "nvidia-targeted-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-pulse":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-bandwidth":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-interconnect":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
if planErr != nil {
err = planErr
break
}
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
dur = rampPlan.DurationSec
}
if rampPlan.StaggerSeconds > 0 {
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: t.params.Loader,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
StaggerSeconds: rampPlan.StaggerSeconds,
}, j.append)
case "memory":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
case "storage":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
case "cpu":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
if dur <= 0 {
if t.params.StressMode {
dur = 1800
} else {
dur = 60
}
}
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
case "amd":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
case "amd-mem":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
case "amd-bandwidth":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
case "amd-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
case "memory-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
case "sat-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
case "platform-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
runOpts.Components = t.params.PlatformComponents
archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
case "audit":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
result, e := a.RunAuditNow(opts.RuntimeMode)
if e != nil {
err = e
} else {
for _, line := range splitLines(result.Body) {
j.append(line)
}
}
case "support-bundle":
j.append("Building support bundle...")
archive, err = buildSupportBundle(opts.ExportDir)
case "install":
if strings.TrimSpace(t.params.Device) == "" {
err = fmt.Errorf("device is required")
break
}
installLogPath := platform.InstallLogPath(t.params.Device)
j.append("Install log: " + installLogPath)
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
case "install-to-ram":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
err = a.RunInstallToRAM(ctx, j.append)
case "nvme-format":
if strings.TrimSpace(t.params.Device) == "" {
err = fmt.Errorf("device is required")
break
}
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
case "saa-dmi-write":
if len(t.params.SAADmiChanges) == 0 {
err = fmt.Errorf("no changes provided")
break
}
err = runSAADMIWriteTask(ctx, j, opts.ExportDir, t.params)
case "ipmi-fru-write":
if len(t.params.FRUChanges) == 0 {
err = fmt.Errorf("no changes provided")
break
}
err = runIPMIFRUWriteTask(ctx, j, opts.ExportDir, t.params)
case "huawei-elabel-write":
if len(t.params.HuaweiElabelChanges) == 0 {
err = fmt.Errorf("no changes provided")
break
}
err = runHuaweiElabelWriteTask(ctx, j, t.params)
case "raid-foreign-clear":
err = runRAIDForeignClearTask(ctx, j, t.params.RAIDController)
case "raid-foreign-import":
err = runRAIDForeignImportTask(ctx, j, t.params.RAIDController)
case "raid-lsi-create-mirror":
if len(t.params.RAIDDevices) < 2 {
err = fmt.Errorf("at least 2 drives required")
break
}
err = runRAIDLSICreateMirrorTask(ctx, j, t.params.RAIDController, t.params.RAIDDevices)
case "raid-lsi-prepare-drive":
if strings.TrimSpace(t.params.RAIDSlot) == "" {
err = fmt.Errorf("no drive slot provided")
break
}
err = runRAIDPrepareDriveTask(ctx, j, t.params.RAIDController, t.params.RAIDSlot)
case "raid-vroc-create-mirror":
if len(t.params.RAIDDevices) < 2 {
err = fmt.Errorf("at least 2 devices required")
break
}
err = runRAIDVROCCreateMirrorTask(ctx, j, t.params.RAIDDevices, t.params.RAIDArrayName)
default:
j.append("ERROR: unknown target: " + t.Target)
j.finish("unknown target")
return
}
if archive != "" {
archivePath := app.ExtractArchivePath(archive)
if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
}
if opts.App != nil && opts.App.StatusDB != nil {
app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
}
}
if err != nil {
if ctx.Err() != nil {
j.append("Aborted.")
j.finish("aborted")
} else {
j.append("ERROR: " + err.Error())
j.finish(err.Error())
}
return
}
if archive != "" {
j.append("Archive: " + archive)
}
j.finish("")
}
func loadPersistedTask(statePath, taskID string) (*Task, error) {
data, err := os.ReadFile(statePath)
if err != nil {
return nil, err
}
var persisted []persistedTask
if err := json.Unmarshal(data, &persisted); err != nil {
return nil, err
}
for _, pt := range persisted {
if pt.ID != taskID {
continue
}
t := &Task{
ID: pt.ID,
Name: pt.Name,
Target: pt.Target,
Priority: pt.Priority,
Status: pt.Status,
CreatedAt: pt.CreatedAt,
StartedAt: pt.StartedAt,
DoneAt: pt.DoneAt,
ErrMsg: pt.ErrMsg,
LogPath: pt.LogPath,
ArtifactsDir: pt.ArtifactsDir,
ReportJSONPath: pt.ReportJSONPath,
ReportHTMLPath: pt.ReportHTMLPath,
params: pt.Params,
}
ensureTaskReportPaths(t)
return t, nil
}
return nil, fmt.Errorf("task %s not found", taskID)
}
func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
return 2
}
runtimeInfo, err := runtimeenv.Detect("auto")
if err != nil {
slog.Warn("resolve runtime for task-run", "err", err)
}
opts := &HandlerOptions{
ExportDir: exportDir,
App: app.New(platform.New()),
RuntimeMode: runtimeInfo.Mode,
}
statePath := filepath.Join(exportDir, "tasks-state.json")
task, err := loadPersistedTask(statePath, taskID)
if err != nil {
fmt.Fprintln(stderr, err.Error())
return 1
}
if task.StartedAt == nil || task.StartedAt.IsZero() {
now := time.Now()
task.StartedAt = &now
}
if task.Status == "" {
task.Status = TaskRunning
}
if err := writeTaskRunnerState(task, taskRunnerState{
PID: os.Getpid(),
Status: TaskRunning,
UpdatedAt: time.Now().UTC(),
}); err != nil {
fmt.Fprintln(stderr, err.Error())
return 1
}
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer cancel()
j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
executeTaskWithOptions(opts, task, j, ctx)
finalizeTaskForResult(task, j.err, ctx.Err() != nil)
if err := writeTaskReportArtifacts(task); err != nil {
appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
}
j.closeLog()
if err := writeTaskRunnerState(task, taskRunnerState{
PID: os.Getpid(),
Status: task.Status,
Error: task.ErrMsg,
UpdatedAt: time.Now().UTC(),
}); err != nil {
fmt.Fprintln(stderr, err.Error())
}
if task.ErrMsg != "" {
return 1
}
return 0
}