Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| be4b439804 | |||
| 749fc8a94d | |||
| 6112094d45 | |||
| e9a2bc9f9d | |||
|
|
7a8f884664 | ||
|
|
8bf8dfa45b | ||
|
|
6a22199aff | ||
|
|
ddb2bb5d1c | ||
|
|
aa284ae754 | ||
|
|
8512098174 | ||
|
|
6b5d22c194 | ||
|
|
a35e90a93e | ||
|
|
1ced81707f | ||
|
|
679aeb9947 | ||
|
|
647e99b697 | ||
|
|
4af997f436 | ||
|
|
6caace0cc0 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,3 +3,4 @@
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
audit/bee
|
||||
|
||||
@@ -71,6 +71,8 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||
return runSAT(args[1:], stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark(args[1:], stdout, stderr)
|
||||
case "bee-worker":
|
||||
return runBeeWorker(args[1:], stdout, stderr)
|
||||
case "version", "--version", "-version":
|
||||
fmt.Fprintln(stdout, Version)
|
||||
return 0
|
||||
@@ -90,6 +92,7 @@ func printRootUsage(w io.Writer) {
|
||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
|
||||
bee version
|
||||
bee help [command]`)
|
||||
}
|
||||
@@ -110,6 +113,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
return runSAT([]string{"--help"}, stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||
case "bee-worker":
|
||||
return runBeeWorker([]string{"--help"}, stdout, stderr)
|
||||
case "version":
|
||||
fmt.Fprintln(stdout, "usage: bee version")
|
||||
return 0
|
||||
@@ -462,6 +467,28 @@ func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBeeWorker(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
|
||||
taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
|
||||
}
|
||||
|
||||
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
|
||||
@@ -67,6 +67,13 @@ type benchmarkPowerCalibrationResult struct {
|
||||
MetricRows []GPUMetricRow
|
||||
}
|
||||
|
||||
// benchmarkPowerCalibrationRunSummary carries telemetry averaged over a
// power-calibration run: the SDR power readings captured while GPUs were
// under load, plus mean fan figures and the sample count they came from.
type benchmarkPowerCalibrationRunSummary struct {
	// LoadedSDR holds phase-averaged IPMI SDR power readings collected
	// while the benchmark load was running.
	LoadedSDR benchmarkSDRSeriesSummary
	// AvgFanRPM is the mean of the per-sample mean fan RPM readings.
	AvgFanRPM float64
	// AvgFanDutyCyclePct is the mean fan duty cycle in percent.
	AvgFanDutyCyclePct float64
	// FanSamples counts the RPM samples the averages were computed from.
	FanSamples int
}
|
||||
|
||||
type benchmarkBurnProfile struct {
|
||||
name string
|
||||
category string
|
||||
@@ -98,6 +105,7 @@ var (
|
||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||
benchmarkGeteuid = os.Geteuid
|
||||
benchmarkResetNvidiaGPU = resetNvidiaGPU
|
||||
benchmarkSleep = time.Sleep
|
||||
)
|
||||
|
||||
@@ -242,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
||||
return nil
|
||||
}
|
||||
|
||||
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
|
||||
}
|
||||
out, err := benchmarkResetNvidiaGPU(gpuIndex)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
|
||||
)
|
||||
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" {
|
||||
logFunc(line)
|
||||
}
|
||||
}
|
||||
}
|
||||
rc := 0
|
||||
if err != nil {
|
||||
rc = 1
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||
fmt.Sprintf("rc: %d", rc),
|
||||
"",
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||
if len(gpuIndices) == 0 {
|
||||
return nil
|
||||
@@ -259,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
|
||||
}
|
||||
var failed []int
|
||||
for _, idx := range gpuIndices {
|
||||
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
||||
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
||||
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
|
||||
failed = append(failed, idx)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||
@@ -2413,6 +2449,16 @@ type sdrPowerSnapshot struct {
|
||||
SkippedSensors []string // sensors rejected during self-healing
|
||||
}
|
||||
|
||||
// benchmarkSDRSeriesSummary aggregates a series of IPMI SDR power
// snapshots into per-sensor means (built by summarizeSDRPowerSeries).
type benchmarkSDRSeriesSummary struct {
	// PSUInW is the mean PSU AC input power over positive samples.
	PSUInW float64
	// PSUOutW is the mean PSU DC output power over positive samples.
	PSUOutW float64
	// GPUSlotW is the mean GPU PCIe slot power over positive samples.
	GPUSlotW float64
	// PSUSlots maps PSU slot name to its averaged per-slot readings.
	PSUSlots map[string]BenchmarkPSUSlotPower
	// Samples is the number of snapshots the averages were built from.
	Samples int

	// SkippedSensors lists sensors rejected during self-healing,
	// de-duplicated across all samples and sorted.
	SkippedSensors []string
}
|
||||
|
||||
// sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
|
||||
type sdrSensor struct {
|
||||
name string
|
||||
@@ -2542,6 +2588,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
|
||||
return snap
|
||||
}
|
||||
|
||||
func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot {
|
||||
if intervalSec <= 0 {
|
||||
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||
}
|
||||
ch := make(chan []sdrPowerSnapshot, 1)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
var samples []sdrPowerSnapshot
|
||||
record := func() {
|
||||
snap := sampleIPMISDRPowerSensors()
|
||||
if snap.PSUInW <= 0 && snap.PSUOutW <= 0 && snap.GPUSlotW <= 0 && len(snap.PSUSlots) == 0 {
|
||||
return
|
||||
}
|
||||
samples = append(samples, snap)
|
||||
}
|
||||
record()
|
||||
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
ch <- samples
|
||||
return
|
||||
case <-ticker.C:
|
||||
record()
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary {
|
||||
var summary benchmarkSDRSeriesSummary
|
||||
if len(samples) == 0 {
|
||||
return summary
|
||||
}
|
||||
|
||||
type slotAggregate struct {
|
||||
inputs []float64
|
||||
outputs []float64
|
||||
status string
|
||||
}
|
||||
|
||||
slotAgg := make(map[string]*slotAggregate)
|
||||
skippedSet := make(map[string]struct{})
|
||||
var inputTotals []float64
|
||||
var outputTotals []float64
|
||||
var gpuSlotTotals []float64
|
||||
|
||||
for _, sample := range samples {
|
||||
if sample.PSUInW > 0 {
|
||||
inputTotals = append(inputTotals, sample.PSUInW)
|
||||
}
|
||||
if sample.PSUOutW > 0 {
|
||||
outputTotals = append(outputTotals, sample.PSUOutW)
|
||||
}
|
||||
if sample.GPUSlotW > 0 {
|
||||
gpuSlotTotals = append(gpuSlotTotals, sample.GPUSlotW)
|
||||
}
|
||||
for _, skipped := range sample.SkippedSensors {
|
||||
if skipped != "" {
|
||||
skippedSet[skipped] = struct{}{}
|
||||
}
|
||||
}
|
||||
for slot, reading := range sample.PSUSlots {
|
||||
agg := slotAgg[slot]
|
||||
if agg == nil {
|
||||
agg = &slotAggregate{}
|
||||
slotAgg[slot] = agg
|
||||
}
|
||||
if reading.InputW != nil && *reading.InputW > 0 {
|
||||
agg.inputs = append(agg.inputs, *reading.InputW)
|
||||
}
|
||||
if reading.OutputW != nil && *reading.OutputW > 0 {
|
||||
agg.outputs = append(agg.outputs, *reading.OutputW)
|
||||
}
|
||||
switch {
|
||||
case reading.Status == "":
|
||||
case agg.status == "":
|
||||
agg.status = reading.Status
|
||||
case agg.status == "OK" && reading.Status != "OK":
|
||||
agg.status = reading.Status
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
summary.PSUInW = benchmarkMean(inputTotals)
|
||||
summary.PSUOutW = benchmarkMean(outputTotals)
|
||||
summary.GPUSlotW = benchmarkMean(gpuSlotTotals)
|
||||
summary.Samples = len(samples)
|
||||
|
||||
if len(slotAgg) > 0 {
|
||||
summary.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(slotAgg))
|
||||
for slot, agg := range slotAgg {
|
||||
reading := BenchmarkPSUSlotPower{Status: agg.status}
|
||||
if mean := benchmarkMean(agg.inputs); mean > 0 {
|
||||
v := mean
|
||||
reading.InputW = &v
|
||||
}
|
||||
if mean := benchmarkMean(agg.outputs); mean > 0 {
|
||||
v := mean
|
||||
reading.OutputW = &v
|
||||
}
|
||||
summary.PSUSlots[slot] = reading
|
||||
}
|
||||
}
|
||||
if len(skippedSet) > 0 {
|
||||
summary.SkippedSensors = make([]string, 0, len(skippedSet))
|
||||
for skipped := range skippedSet {
|
||||
summary.SkippedSensors = append(summary.SkippedSensors, skipped)
|
||||
}
|
||||
sort.Strings(summary.SkippedSensors)
|
||||
}
|
||||
|
||||
return summary
|
||||
}
|
||||
|
||||
func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary {
|
||||
if durationSec <= 0 {
|
||||
return benchmarkSDRSeriesSummary{}
|
||||
}
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := startIPMISDRSampler(stopCh, intervalSec)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||
}
|
||||
close(stopCh)
|
||||
return summarizeSDRPowerSeries(<-doneCh)
|
||||
}
|
||||
|
||||
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||
func queryIPMIServerPowerW() (float64, error) {
|
||||
@@ -3086,8 +3263,9 @@ func runBenchmarkPowerCalibration(
|
||||
logFunc func(string),
|
||||
seedLimits map[int]int,
|
||||
durationSec int,
|
||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
|
||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow, benchmarkPowerCalibrationRunSummary) {
|
||||
calibDurationSec := durationSec
|
||||
var runSummary benchmarkPowerCalibrationRunSummary
|
||||
if calibDurationSec <= 0 {
|
||||
calibDurationSec = 120
|
||||
}
|
||||
@@ -3105,12 +3283,12 @@ func runBenchmarkPowerCalibration(
|
||||
if engine == BenchmarkPowerEngineTargetedPower {
|
||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
|
||||
}
|
||||
} else {
|
||||
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
|
||||
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
|
||||
}
|
||||
}
|
||||
if killed := KillTestWorkers(); len(killed) > 0 {
|
||||
@@ -3275,6 +3453,10 @@ calibDone:
|
||||
}
|
||||
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
||||
doneCh := make(chan sharedAttemptResult, 1)
|
||||
sdrStopCh := make(chan struct{})
|
||||
sdrDoneCh := startIPMISDRSampler(sdrStopCh, benchmarkPowerAutotuneSampleInterval)
|
||||
fanStopCh := make(chan struct{})
|
||||
fanDoneCh := startBenchmarkFanSampler(fanStopCh, benchmarkPowerAutotuneSampleInterval)
|
||||
go func() {
|
||||
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
|
||||
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
||||
@@ -3314,6 +3496,10 @@ calibDone:
|
||||
}
|
||||
ticker.Stop()
|
||||
cancelAttempt()
|
||||
close(sdrStopCh)
|
||||
close(fanStopCh)
|
||||
attemptSDRSummary := summarizeSDRPowerSeries(<-sdrDoneCh)
|
||||
attemptFanSummary := <-fanDoneCh
|
||||
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
||||
// Accumulate telemetry rows with attempt stage label.
|
||||
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
|
||||
@@ -3351,10 +3537,14 @@ calibDone:
|
||||
busyDelaySec = 1
|
||||
|
||||
// Per-GPU analysis and binary search update.
|
||||
attemptStable := ar.err == nil
|
||||
for _, s := range active {
|
||||
perGPU := filterRowsByGPU(ar.rows, s.idx)
|
||||
summary := summarizeBenchmarkTelemetry(perGPU)
|
||||
throttle := throttleReasons[s.idx]
|
||||
if throttle != "" || summary.P95PowerW <= 0 {
|
||||
attemptStable = false
|
||||
}
|
||||
|
||||
// Cooling warning: thermal throttle with fans not at maximum.
|
||||
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
|
||||
@@ -3487,6 +3677,16 @@ calibDone:
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
|
||||
}
|
||||
if attemptStable {
|
||||
if attemptSDRSummary.Samples > 0 {
|
||||
runSummary.LoadedSDR = attemptSDRSummary
|
||||
}
|
||||
if attemptFanSummary.FanSamples > 0 {
|
||||
runSummary.AvgFanRPM = attemptFanSummary.AvgFanRPM
|
||||
runSummary.AvgFanDutyCyclePct = attemptFanSummary.AvgFanDutyCyclePct
|
||||
runSummary.FanSamples = attemptFanSummary.FanSamples
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, s := range states {
|
||||
@@ -3495,7 +3695,7 @@ calibDone:
|
||||
}
|
||||
}
|
||||
writeBenchmarkMetricsFiles(runDir, allCalibRows)
|
||||
return results, restore, allCalibRows
|
||||
return results, restore, allCalibRows, runSummary
|
||||
}
|
||||
|
||||
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
||||
@@ -3540,6 +3740,47 @@ func meanFanRPM(fans []FanReading) float64 {
|
||||
return sum / float64(len(fans))
|
||||
}
|
||||
|
||||
func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary {
|
||||
if intervalSec <= 0 {
|
||||
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||
}
|
||||
ch := make(chan benchmarkPowerCalibrationRunSummary, 1)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
var rpmSamples []float64
|
||||
var dutySamples []float64
|
||||
record := func() {
|
||||
fans, err := sampleFanSpeeds()
|
||||
if err != nil || len(fans) == 0 {
|
||||
return
|
||||
}
|
||||
if rpm := meanFanRPM(fans); rpm > 0 {
|
||||
rpmSamples = append(rpmSamples, rpm)
|
||||
}
|
||||
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok && duty > 0 {
|
||||
dutySamples = append(dutySamples, duty)
|
||||
}
|
||||
}
|
||||
record()
|
||||
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
ch <- benchmarkPowerCalibrationRunSummary{
|
||||
AvgFanRPM: benchmarkMean(rpmSamples),
|
||||
AvgFanDutyCyclePct: benchmarkMean(dutySamples),
|
||||
FanSamples: len(rpmSamples),
|
||||
}
|
||||
return
|
||||
case <-ticker.C:
|
||||
record()
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
func powerBenchDurationSec(profile string) int {
|
||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||
case NvidiaBenchmarkProfileStability:
|
||||
@@ -3568,41 +3809,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
|
||||
if sp := result.ServerPower; sp != nil && sp.Available {
|
||||
fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW)
|
||||
if sp.PSUInputLoadedW > 0 {
|
||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||
fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta)
|
||||
sourceLabel := "autotuned source"
|
||||
switch normalizeBenchmarkPowerSource(sp.Source) {
|
||||
case BenchmarkPowerSourceSDRPSUInput:
|
||||
sourceLabel = "autotuned source (SDR PSU AC input)"
|
||||
case BenchmarkPowerSourceDCMI:
|
||||
sourceLabel = "autotuned source (DCMI)"
|
||||
}
|
||||
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio)
|
||||
fmt.Fprintf(&b, "**Server power delta (%s):** %.0f W \n", sourceLabel, sp.DeltaW)
|
||||
fmt.Fprintf(&b, "**Reporting ratio:** %.2f \n", sp.ReportingRatio)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
// Server power comparison table.
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
||||
selectedSource := normalizeBenchmarkPowerSource(sp.Source)
|
||||
selectedSourceLabel := "Selected source"
|
||||
if selectedSource == BenchmarkPowerSourceSDRPSUInput {
|
||||
selectedSourceLabel = "Selected source (SDR PSU AC input)"
|
||||
} else if selectedSource == BenchmarkPowerSourceDCMI {
|
||||
selectedSourceLabel = "Selected source (DCMI)"
|
||||
}
|
||||
var spRows [][]string
|
||||
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
|
||||
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
|
||||
if sp.GPUSlotTotalW > 0 {
|
||||
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
|
||||
}
|
||||
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
|
||||
if sp.Available {
|
||||
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
|
||||
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
|
||||
spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
|
||||
spRows = append(spRows, []string{selectedSourceLabel + " idle power", fmt.Sprintf("%.0f W", sp.IdleW)})
|
||||
spRows = append(spRows, []string{selectedSourceLabel + " loaded power", fmt.Sprintf("%.0f W", sp.LoadedW)})
|
||||
spRows = append(spRows, []string{selectedSourceLabel + " Δ power (loaded − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)})
|
||||
}
|
||||
if sp.PSUInputLoadedW > 0 {
|
||||
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
|
||||
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
|
||||
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 {
|
||||
spRows = append(spRows, []string{"PSU AC input (idle avg, pre-load phase)", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
|
||||
spRows = append(spRows, []string{"PSU AC input (loaded avg, final phase)", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
|
||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
|
||||
}
|
||||
if sp.PSUOutputLoadedW > 0 {
|
||||
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
|
||||
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
|
||||
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
|
||||
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
|
||||
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
|
||||
}
|
||||
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", fmt.Sprintf("%.0f W", psuDelta)})
|
||||
}
|
||||
if sp.Available {
|
||||
ratio := sp.ReportingRatio
|
||||
@@ -3619,8 +3858,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
default:
|
||||
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
||||
}
|
||||
spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
|
||||
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
||||
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
|
||||
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||
sdrRatio := psuDelta / sp.GPUReportedSumW
|
||||
sdrNote := ""
|
||||
@@ -3632,12 +3871,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
default:
|
||||
sdrNote = "✗ significant discrepancy"
|
||||
}
|
||||
spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
|
||||
spRows = append(spRows, []string{"PSU AC input reporting ratio", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
|
||||
}
|
||||
} else {
|
||||
spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
|
||||
spRows = append(spRows, []string{"IPMI availability", "not available — IPMI not supported or ipmitool not found"})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
|
||||
b.WriteString(fmtMDTable([]string{"Metric", "Value"}, spRows))
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, "\n> %s\n", note)
|
||||
}
|
||||
@@ -3689,11 +3928,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
psuDistRows = append(psuDistRows, []string{
|
||||
slot,
|
||||
fmtW(idle.InputW), fmtW(loaded.InputW),
|
||||
fmtW(idle.OutputW), fmtW(loaded.OutputW),
|
||||
deltaStr, status,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
|
||||
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle avg)", "AC Input (loaded avg)", "Load Δ", "Status"}, psuDistRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
@@ -3741,7 +3979,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
fan,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Avg Fan RPM (duty%)"}, sgRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
@@ -3850,7 +4088,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
for _, slot := range psuSlots {
|
||||
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
|
||||
}
|
||||
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
|
||||
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Avg Fan RPM (duty%)")
|
||||
|
||||
var psuRows [][]string
|
||||
for _, step := range result.RampSteps {
|
||||
@@ -3931,7 +4169,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
pdRows = append(pdRows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
|
||||
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
|
||||
fmt.Sprintf("%.0f W", stable),
|
||||
realization,
|
||||
@@ -3944,13 +4181,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
pdRows = append(pdRows, []string{
|
||||
"**Platform**",
|
||||
fmt.Sprintf("**%.0f W**", totalDefault),
|
||||
"—",
|
||||
fmt.Sprintf("**%.0f W**", totalStable),
|
||||
fmt.Sprintf("**%s**", platformReal),
|
||||
"",
|
||||
})
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
|
||||
b.WriteString("\n")
|
||||
|
||||
// Balance across GPUs — only meaningful with 2+ GPUs.
|
||||
@@ -4100,7 +4336,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
{"Avg Temp °C", singleTemp},
|
||||
{"Power W", singlePwr},
|
||||
{"Per GPU wall W", singleWall},
|
||||
{"Fan RPM (duty%)", singleFan},
|
||||
{"Avg Fan RPM (duty%)", singleFan},
|
||||
}
|
||||
if lastStep != nil {
|
||||
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
|
||||
@@ -4208,18 +4444,22 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
// Sample server idle power before any GPU load.
|
||||
var serverIdleW float64
|
||||
var serverIdleOK bool
|
||||
idleSDRStopCh := make(chan struct{})
|
||||
idleSDRCh := startIPMISDRSampler(idleSDRStopCh, benchmarkPowerAutotuneSampleInterval)
|
||||
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
|
||||
serverIdleW = w
|
||||
serverIdleOK = true
|
||||
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||
}
|
||||
sdrIdle := sampleIPMISDRPowerSensors()
|
||||
close(idleSDRStopCh)
|
||||
sdrIdle := summarizeSDRPowerSeries(<-idleSDRCh)
|
||||
psuBefore := psuStatusSnapshot()
|
||||
|
||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||||
singleIPMILoadedW := make(map[int]float64, len(selected))
|
||||
singleRunSummaryByIndex := make(map[int]benchmarkPowerCalibrationRunSummary, len(selected))
|
||||
var allRestoreActions []benchmarkRestoreAction
|
||||
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
|
||||
var allPowerRows []GPUMetricRow
|
||||
@@ -4229,27 +4469,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
_ = os.MkdirAll(singleDir, 0755)
|
||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||
result.Findings = append(result.Findings,
|
||||
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
||||
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
|
||||
}
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||
singlePowerStopCh := make(chan struct{})
|
||||
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||
c, restore, singleRows, singleRun := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||
close(singlePowerStopCh)
|
||||
sdrSingle := sampleIPMISDRPowerSensors()
|
||||
if samples := <-singlePowerCh; len(samples) > 0 {
|
||||
singleIPMILoadedW[idx] = benchmarkMean(samples)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
|
||||
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
|
||||
singleIPMILoadedW[idx] = sdrSingle.PSUInW
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
|
||||
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun.LoadedSDR.PSUInW > 0 {
|
||||
singleIPMILoadedW[idx] = singleRun.LoadedSDR.PSUInW
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR avg): %.0f W", idx, singleRun.LoadedSDR.PSUInW))
|
||||
}
|
||||
allRestoreActions = append(allRestoreActions, restore...)
|
||||
if r, ok := c[idx]; ok {
|
||||
calibByIndex[idx] = r
|
||||
}
|
||||
singleRunSummaryByIndex[idx] = singleRun
|
||||
}
|
||||
defer func() {
|
||||
for i := len(allRestoreActions) - 1; i >= 0; i-- {
|
||||
@@ -4292,11 +4531,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
||||
gpu.Telemetry = &t
|
||||
}
|
||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
||||
gpu.AvgFanRPM = meanFanRPM(fans)
|
||||
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
||||
gpu.AvgFanDutyCyclePct = duty
|
||||
}
|
||||
if singleRun := singleRunSummaryByIndex[idx]; singleRun.AvgFanRPM > 0 {
|
||||
gpu.AvgFanRPM = singleRun.AvgFanRPM
|
||||
gpu.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||
}
|
||||
gpus = append(gpus, gpu)
|
||||
}
|
||||
@@ -4352,10 +4589,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
||||
var serverLoadedW float64
|
||||
var serverLoadedOK bool
|
||||
// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
|
||||
// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
|
||||
// after the test when GPUs have already returned to idle.
|
||||
var sdrLastStep sdrPowerSnapshot
|
||||
// sdrLastStep retains the phase-averaged SDR readings from the last ramp step
|
||||
// while GPUs are loaded. Used in the summary instead of re-sampling after the
|
||||
// test when GPUs have already returned to idle.
|
||||
var sdrLastStep benchmarkSDRSeriesSummary
|
||||
|
||||
// Step 1: reuse single-card calibration result directly.
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
@@ -4376,6 +4613,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
ramp.ServerLoadedW = w
|
||||
ramp.ServerDeltaW = w - serverIdleW
|
||||
}
|
||||
if singleRun := singleRunSummaryByIndex[firstIdx]; singleRun.AvgFanRPM > 0 {
|
||||
ramp.AvgFanRPM = singleRun.AvgFanRPM
|
||||
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||
}
|
||||
if !firstCalib.Completed {
|
||||
ramp.Status = "FAILED"
|
||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||
@@ -4426,7 +4667,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
stepPowerStopCh := make(chan struct{})
|
||||
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||
stepCalib, stepRestore, stepRows, stepRun := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||
close(stepPowerStopCh)
|
||||
var stepIPMILoadedW float64
|
||||
@@ -4497,10 +4738,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||
}
|
||||
|
||||
// Per-step PSU slot snapshot — also used as the authoritative loaded power
|
||||
// source when SDR PSU sensors are available (more accurate than DCMI on
|
||||
// servers where DCMI covers only a subset of installed PSUs).
|
||||
sdrStep := sampleIPMISDRPowerSensors()
|
||||
// Per-step PSU slot readings are averaged over the whole load phase rather
|
||||
// than captured as a single end-of-phase snapshot.
|
||||
sdrStep := stepRun.LoadedSDR
|
||||
if len(sdrStep.PSUSlots) > 0 {
|
||||
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||
}
|
||||
@@ -4518,7 +4758,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
|
||||
ramp.ServerLoadedW = sdrStep.PSUInW
|
||||
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
||||
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
|
||||
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR avg): %.0f W", step, sdrStep.PSUInW))
|
||||
if step == len(result.RecommendedSlotOrder) {
|
||||
serverLoadedW = sdrStep.PSUInW
|
||||
serverLoadedOK = true
|
||||
@@ -4526,12 +4766,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
}
|
||||
}
|
||||
|
||||
// Fan state at end of ramp step.
|
||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
||||
ramp.AvgFanRPM = meanFanRPM(fans)
|
||||
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
||||
ramp.AvgFanDutyCyclePct = duty
|
||||
}
|
||||
// Fan values are phase averages over the same load window.
|
||||
if stepRun.AvgFanRPM > 0 {
|
||||
ramp.AvgFanRPM = stepRun.AvgFanRPM
|
||||
ramp.AvgFanDutyCyclePct = stepRun.AvgFanDutyCyclePct
|
||||
}
|
||||
|
||||
// Per-GPU telemetry from this ramp step's calibration.
|
||||
@@ -4584,8 +4822,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
|
||||
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
|
||||
if result.ServerPower != nil {
|
||||
// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
|
||||
// than re-sampling here, which would capture post-test idle state.
|
||||
// Use the SDR phase average from the last ramp step (GPUs still loaded)
|
||||
// rather than re-sampling here, which would capture post-test idle state.
|
||||
sdrLoaded := sdrLastStep
|
||||
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
|
||||
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
|
||||
@@ -4605,6 +4843,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
||||
"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
|
||||
}
|
||||
if sdrLoaded.Samples > 0 {
|
||||
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
||||
fmt.Sprintf("Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step.", sdrLoaded.Samples))
|
||||
}
|
||||
// Detect DCMI partial coverage: direct SDR comparison first,
|
||||
// ramp heuristic as fallback when SDR PSU sensors are absent.
|
||||
dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||
|
||||
|
||||
@@ -2,7 +2,7 @@ package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
@@ -188,18 +188,16 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldExec := satExecCommand
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 1000 }
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
t.Fatalf("unexpected command: %s %v", name, args)
|
||||
return nil
|
||||
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||
t.Fatal("unexpected reset call")
|
||||
return "", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
satExecCommand = oldExec
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
var logs []string
|
||||
@@ -215,44 +213,52 @@ func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "nvidia-smi")
|
||||
argsLog := filepath.Join(dir, "args.log")
|
||||
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
||||
t.Fatalf("write script: %v", err)
|
||||
}
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldLookPath := satLookPath
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "nvidia-smi" {
|
||||
return script, nil
|
||||
}
|
||||
return exec.LookPath(file)
|
||||
var calls []int
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
calls = append(calls, index)
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
satLookPath = oldLookPath
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if len(failed) != 0 {
|
||||
t.Fatalf("failed=%v want no failures", failed)
|
||||
}
|
||||
raw, err := os.ReadFile(argsLog)
|
||||
if err != nil {
|
||||
t.Fatalf("read args log: %v", err)
|
||||
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||
t.Fatalf("calls=%v want %s", calls, want)
|
||||
}
|
||||
got := strings.Fields(string(raw))
|
||||
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
||||
if strings.Join(got, " ") != strings.Join(want, " ") {
|
||||
t.Fatalf("args=%v want %v", got, want)
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
if index == 5 {
|
||||
return "busy\n", exec.ErrNotFound
|
||||
}
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||
t.Fatalf("failed=%v want %s", failed, want)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ var workerPatterns = []string{
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
"nvbandwidth",
|
||||
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||
"nvvs",
|
||||
@@ -71,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
|
||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||
base = exe[idx+1:]
|
||||
}
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
break
|
||||
}
|
||||
if shouldKillWorkerProcess(exe, base) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
}
|
||||
}
|
||||
return killed
|
||||
}
|
||||
|
||||
func shouldKillWorkerProcess(exe, base string) bool {
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
39
audit/internal/platform/kill_workers_test.go
Normal file
39
audit/internal/platform/kill_workers_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestShouldKillWorkerProcess(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
exe string
|
||||
base string
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "nvbandwidth executable",
|
||||
exe: "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
|
||||
base: "nvbandwidth",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "dcgmi executable",
|
||||
exe: "/usr/bin/dcgmi",
|
||||
base: "dcgmi",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "unrelated process",
|
||||
exe: "/usr/bin/bash",
|
||||
base: "bash",
|
||||
want: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
|
||||
t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -3,6 +3,8 @@ package platform
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -28,3 +30,22 @@ func runNvidiaRecover(args ...string) (string, error) {
|
||||
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
func resetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
|
||||
func restartNvidiaDrivers() (string, error) {
|
||||
out, err := runNvidiaRecover("restart-drivers")
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "NVIDIA drivers restarted.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
|
||||
@@ -30,10 +30,10 @@ import (
|
||||
// Sources:
|
||||
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
||||
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
||||
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
||||
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
||||
// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||
// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||
// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||
// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||
@@ -48,15 +48,15 @@ const (
|
||||
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||
SATEstimatedMemoryStressSec = 140
|
||||
|
||||
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
|
||||
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
|
||||
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
|
||||
SATEstimatedNvidiaGPUStressPerGPUSec = 450
|
||||
// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
|
||||
SATEstimatedNvidiaGPUValidateSec = 85
|
||||
// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
|
||||
SATEstimatedNvidiaGPUStressSec = 450
|
||||
|
||||
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
|
||||
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
|
||||
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
|
||||
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
|
||||
// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
|
||||
SATEstimatedNvidiaTargetedStressSec = 350
|
||||
// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
|
||||
SATEstimatedNvidiaTargetedPowerSec = 350
|
||||
|
||||
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||
SATEstimatedNvidiaPulseTestSec = 5000
|
||||
@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
|
||||
}
|
||||
|
||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return out, err
|
||||
return resetNvidiaGPU(index)
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
|
||||
@@ -62,7 +62,7 @@ func (s *System) ServiceState(name string) string {
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
if name == "bee-nvidia" && action == ServiceRestart {
|
||||
return runNvidiaRecover("restart-drivers")
|
||||
return restartNvidiaDrivers()
|
||||
}
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
|
||||
@@ -806,15 +806,14 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
if t.job == nil || !t.job.abort() {
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "not_running"})
|
||||
return
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborting"})
|
||||
return
|
||||
}
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborted"})
|
||||
|
||||
@@ -20,7 +20,7 @@ type jobState struct {
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
serialPrefix string
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logBuf *bufio.Writer
|
||||
}
|
||||
|
||||
@@ -53,13 +53,21 @@ func (j *jobState) abort() bool {
|
||||
}
|
||||
|
||||
func (j *jobState) append(line string) {
|
||||
j.appendWithOptions(line, true, true)
|
||||
}
|
||||
|
||||
func (j *jobState) appendFromLog(line string) {
|
||||
j.appendWithOptions(line, false, false)
|
||||
}
|
||||
|
||||
func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.lines = append(j.lines, line)
|
||||
if j.logPath != "" {
|
||||
if persistLog && j.logPath != "" {
|
||||
j.writeLogLineLocked(line)
|
||||
}
|
||||
if j.serialPrefix != "" {
|
||||
if serialMirror && j.serialPrefix != "" {
|
||||
taskSerialWriteLine(j.serialPrefix + line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
|
||||
@@ -207,7 +207,7 @@ func renderInstall() string {
|
||||
func renderTasks() string {
|
||||
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
||||
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
||||
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
|
||||
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
|
||||
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
|
||||
</div>
|
||||
@@ -289,7 +289,7 @@ function cancelAll() {
|
||||
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
||||
}
|
||||
function killWorkers() {
|
||||
if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
|
||||
if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
|
||||
fetch('/api/tasks/kill-workers',{method:'POST'})
|
||||
.then(r=>r.json())
|
||||
.then(d=>{
|
||||
|
||||
@@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int {
|
||||
}
|
||||
total := platform.SATEstimatedCPUValidateSec +
|
||||
platform.SATEstimatedMemoryValidateSec +
|
||||
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
if n > 0 {
|
||||
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
@@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int {
|
||||
}
|
||||
total := platform.SATEstimatedCPUStressSec +
|
||||
platform.SATEstimatedMemoryStressSec +
|
||||
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
||||
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
||||
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
||||
platform.SATEstimatedNvidiaPulseTestSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
if n > 0 {
|
||||
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
@@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
func() string {
|
||||
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
||||
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
||||
if n > 0 {
|
||||
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
||||
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
||||
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
||||
}
|
||||
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
||||
validateFmtDur(perV), validateFmtDur(perS))
|
||||
}(),
|
||||
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
func() string {
|
||||
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
||||
s := "Skipped in Validate. "
|
||||
if n > 0 {
|
||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||
} else {
|
||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||
}
|
||||
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||
}(),
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
@@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
func() string {
|
||||
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
||||
s := "Skipped in Validate. "
|
||||
if n > 0 {
|
||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||
} else {
|
||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||
}
|
||||
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||
}(),
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
@@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) {
|
||||
return enqueueSATTarget(target, overrides)
|
||||
.then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
||||
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const nvidiaPerGPUTargets = [];
|
||||
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
function satAllGPUIndicesForMulti() {
|
||||
return Promise.resolve(satSelectedGPUIndices());
|
||||
}
|
||||
@@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) {
|
||||
});
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
return loadSatNvidiaGPUs().then(gpus => {
|
||||
const selected = satSelectedGPUIndices();
|
||||
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
|
||||
if (!picked.length) {
|
||||
throw new Error('Select at least one NVIDIA GPU.');
|
||||
}
|
||||
if (picked.length === 1) {
|
||||
const gpu = picked[0];
|
||||
return runSATWithOverrides(target, {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
|
||||
});
|
||||
}
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + target;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
|
||||
const labelBase = satLabels()[target] || ('Validate ' + target);
|
||||
const runNext = (idx) => {
|
||||
if (idx >= picked.length) return Promise.resolve();
|
||||
const gpu = picked[idx];
|
||||
const gpuLabel = satGPUDisplayName(gpu);
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
|
||||
return enqueueSATTarget(target, {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: labelBase + ' (' + gpuLabel + ')',
|
||||
}).then(d => {
|
||||
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
|
||||
}).then(function() {
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
});
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
|
||||
505
audit/internal/webui/task_runner.go
Normal file
505
audit/internal/webui/task_runner.go
Normal file
@@ -0,0 +1,505 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
)
|
||||
|
||||
type taskRunnerState struct {
|
||||
PID int `json:"pid"`
|
||||
Status string `json:"status"`
|
||||
Error string `json:"error,omitempty"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
func taskRunnerStatePath(t *Task) string {
|
||||
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(t.ArtifactsDir, "runner-state.json")
|
||||
}
|
||||
|
||||
func writeTaskRunnerState(t *Task, state taskRunnerState) error {
|
||||
path := taskRunnerStatePath(t)
|
||||
if path == "" {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, path)
|
||||
}
|
||||
|
||||
func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
|
||||
path := taskRunnerStatePath(t)
|
||||
if path == "" {
|
||||
return taskRunnerState{}, false
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil || len(data) == 0 {
|
||||
return taskRunnerState{}, false
|
||||
}
|
||||
var state taskRunnerState
|
||||
if err := json.Unmarshal(data, &state); err != nil {
|
||||
return taskRunnerState{}, false
|
||||
}
|
||||
return state, true
|
||||
}
|
||||
|
||||
func processAlive(pid int) bool {
|
||||
if pid <= 0 {
|
||||
return false
|
||||
}
|
||||
err := syscall.Kill(pid, 0)
|
||||
return err == nil || err == syscall.EPERM
|
||||
}
|
||||
|
||||
func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
switch {
|
||||
case cancelled:
|
||||
t.Status = TaskCancelled
|
||||
t.ErrMsg = "aborted"
|
||||
case strings.TrimSpace(errMsg) != "":
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = errMsg
|
||||
default:
|
||||
t.Status = TaskDone
|
||||
t.ErrMsg = ""
|
||||
}
|
||||
}
|
||||
|
||||
// executeTaskWithOptions runs a single queued task to completion, streaming
// progress lines into j and recording the final outcome via j.finish. It is
// the shared execution core used both by the in-process queue worker and by
// the external "bee-worker" runner process.
//
// NOTE(review): ctx is conventionally the first parameter in Go; the
// signature is kept as-is for existing callers.
func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
	if opts == nil {
		j.append("ERROR: handler options not configured")
		j.finish("handler options not configured")
		return
	}
	a := opts.App

	// A non-empty line buffer means this job state was rebuilt after a
	// bee-web restart; make that visible in the task log.
	recovered := len(j.lines) > 0
	j.append(fmt.Sprintf("Starting %s...", t.Name))
	if recovered {
		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
	}

	var (
		archive string // result archive path, for targets that produce one
		err     error
	)

	// Dispatch on the task target. Each case guards its prerequisites,
	// resolves duration/preset parameters, then runs the matching pack.
	switch t.Target {
	case "nvidia":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		diagLevel := 2
		if t.params.StressMode {
			diagLevel = 3
		}
		// NOTE(review): diagLevel is always 2 or 3 at this point, so this
		// condition is always true and the else branch is unreachable —
		// confirm whether the legacy RunNvidiaAcceptancePack path is still
		// intended to be reachable.
		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
			result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
			if e != nil {
				err = e
			} else {
				archive = result.Body
			}
		} else {
			archive, err = a.RunNvidiaAcceptancePack("", j.append)
		}
	case "nvidia-targeted-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if dur <= 0 {
			dur = 300 // default: 5 minutes
		}
		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
	case "nvidia-bench-perf":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
			Profile:           t.params.BenchmarkProfile,
			SizeMB:            t.params.SizeMB,
			GPUIndices:        t.params.GPUIndices,
			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
			RunNCCL:           t.params.RunNCCL,
			ParallelGPUs:      t.params.ParallelGPUs,
			RampStep:          t.params.RampStep,
			RampTotal:         t.params.RampTotal,
			RampRunID:         t.params.RampRunID,
		}, j.append)
	case "nvidia-bench-power":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
			Profile:           t.params.BenchmarkProfile,
			GPUIndices:        t.params.GPUIndices,
			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
			RampStep:          t.params.RampStep,
			RampTotal:         t.params.RampTotal,
			RampRunID:         t.params.RampRunID,
		}, j.append)
	case "nvidia-bench-autotune":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
			Profile: t.params.BenchmarkProfile,
			SizeMB:  t.params.SizeMB,
		}, t.params.BenchmarkKind, j.append)
	case "nvidia-compute":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
		if planErr != nil {
			err = planErr
			break
		}
		// With staggered start, the plan supplies the per-GPU hold duration.
		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
			dur = rampPlan.DurationSec
		}
		if rampPlan.StaggerSeconds > 0 {
			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
		}
		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
	case "nvidia-targeted-power":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
	case "nvidia-pulse":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
	case "nvidia-bandwidth":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
	case "nvidia-interconnect":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
	case "nvidia-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
		if planErr != nil {
			err = planErr
			break
		}
		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
			dur = rampPlan.DurationSec
		}
		if rampPlan.StaggerSeconds > 0 {
			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
		}
		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
			DurationSec:       dur,
			Loader:            t.params.Loader,
			GPUIndices:        t.params.GPUIndices,
			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
			StaggerSeconds:    rampPlan.StaggerSeconds,
		}, j.append)
	case "memory":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
	case "storage":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
	case "cpu":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		// Fallback defaults: 30 min in stress mode, 1 min otherwise.
		if dur <= 0 {
			if t.params.StressMode {
				dur = 1800
			} else {
				dur = 60
			}
		}
		j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
		archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
	case "amd":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
	case "amd-mem":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
	case "amd-bandwidth":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
	case "amd-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
	case "memory-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
	case "sat-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
	case "platform-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
		runOpts.Components = t.params.PlatformComponents
		archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
	case "audit":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		// Audit has no archive; its report body is echoed into the job log.
		result, e := a.RunAuditNow(opts.RuntimeMode)
		if e != nil {
			err = e
		} else {
			for _, line := range splitLines(result.Body) {
				j.append(line)
			}
		}
	case "support-bundle":
		j.append("Building support bundle...")
		archive, err = buildSupportBundle(opts.ExportDir)
	case "install":
		if strings.TrimSpace(t.params.Device) == "" {
			err = fmt.Errorf("device is required")
			break
		}
		installLogPath := platform.InstallLogPath(t.params.Device)
		j.append("Install log: " + installLogPath)
		err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
	case "install-to-ram":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		err = a.RunInstallToRAM(ctx, j.append)
	default:
		j.append("ERROR: unknown target: " + t.Target)
		j.finish("unknown target")
		return
	}

	// Post-process the archive: a SAT summary of FAILED turns a clean run
	// into an error, and the result is folded into the status DB when one
	// is configured.
	if archive != "" {
		archivePath := app.ExtractArchivePath(archive)
		if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
			err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
		}
		if opts.App != nil && opts.App.StatusDB != nil {
			app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
		}
	}

	if err != nil {
		// A cancelled context means the failure was an abort, not an error.
		if ctx.Err() != nil {
			j.append("Aborted.")
			j.finish("aborted")
		} else {
			j.append("ERROR: " + err.Error())
			j.finish(err.Error())
		}
		return
	}
	if archive != "" {
		j.append("Archive: " + archive)
	}
	j.finish("")
}
|
||||
|
||||
func loadPersistedTask(statePath, taskID string) (*Task, error) {
|
||||
data, err := os.ReadFile(statePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var persisted []persistedTask
|
||||
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, pt := range persisted {
|
||||
if pt.ID != taskID {
|
||||
continue
|
||||
}
|
||||
t := &Task{
|
||||
ID: pt.ID,
|
||||
Name: pt.Name,
|
||||
Target: pt.Target,
|
||||
Priority: pt.Priority,
|
||||
Status: pt.Status,
|
||||
CreatedAt: pt.CreatedAt,
|
||||
StartedAt: pt.StartedAt,
|
||||
DoneAt: pt.DoneAt,
|
||||
ErrMsg: pt.ErrMsg,
|
||||
LogPath: pt.LogPath,
|
||||
ArtifactsDir: pt.ArtifactsDir,
|
||||
ReportJSONPath: pt.ReportJSONPath,
|
||||
ReportHTMLPath: pt.ReportHTMLPath,
|
||||
params: pt.Params,
|
||||
}
|
||||
ensureTaskReportPaths(t)
|
||||
return t, nil
|
||||
}
|
||||
return nil, fmt.Errorf("task %s not found", taskID)
|
||||
}
|
||||
|
||||
func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
|
||||
if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
|
||||
fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
|
||||
return 2
|
||||
}
|
||||
|
||||
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||
if err != nil {
|
||||
slog.Warn("resolve runtime for task-run", "err", err)
|
||||
}
|
||||
opts := &HandlerOptions{
|
||||
ExportDir: exportDir,
|
||||
App: app.New(platform.New()),
|
||||
RuntimeMode: runtimeInfo.Mode,
|
||||
}
|
||||
statePath := filepath.Join(exportDir, "tasks-state.json")
|
||||
task, err := loadPersistedTask(statePath, taskID)
|
||||
if err != nil {
|
||||
fmt.Fprintln(stderr, err.Error())
|
||||
return 1
|
||||
}
|
||||
if task.StartedAt == nil || task.StartedAt.IsZero() {
|
||||
now := time.Now()
|
||||
task.StartedAt = &now
|
||||
}
|
||||
if task.Status == "" {
|
||||
task.Status = TaskRunning
|
||||
}
|
||||
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||
PID: os.Getpid(),
|
||||
Status: TaskRunning,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
fmt.Fprintln(stderr, err.Error())
|
||||
return 1
|
||||
}
|
||||
|
||||
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||
executeTaskWithOptions(opts, task, j, ctx)
|
||||
finalizeTaskForResult(task, j.err, ctx.Err() != nil)
|
||||
if err := writeTaskReportArtifacts(task); err != nil {
|
||||
appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||
}
|
||||
j.closeLog()
|
||||
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||
PID: os.Getpid(),
|
||||
Status: task.Status,
|
||||
Error: task.ErrMsg,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
fmt.Fprintln(stderr, err.Error())
|
||||
}
|
||||
if task.ErrMsg != "" {
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"os"
|
||||
@@ -13,6 +14,7 @@ import (
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
@@ -110,8 +112,9 @@ type Task struct {
|
||||
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
||||
|
||||
// runtime fields (not serialised)
|
||||
job *jobState
|
||||
params taskParams
|
||||
job *jobState
|
||||
runnerPID int
|
||||
params taskParams
|
||||
}
|
||||
|
||||
// taskParams holds optional parameters parsed from the run request.
|
||||
@@ -328,6 +331,13 @@ var (
|
||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
||||
}
|
||||
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return exec.Command(exe, "bee-worker", "--export-dir", exportDir, "--task-id", taskID), nil
|
||||
}
|
||||
)
|
||||
|
||||
// enqueue adds a task to the queue and notifies the worker.
|
||||
@@ -365,6 +375,11 @@ func (q *taskQueue) prune() {
|
||||
|
||||
// nextPending returns the highest-priority pending task (nil if none).
|
||||
func (q *taskQueue) nextPending() *Task {
|
||||
for _, t := range q.tasks {
|
||||
if t.Status == TaskRunning {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
var best *Task
|
||||
for _, t := range q.tasks {
|
||||
if t.Status != TaskPending {
|
||||
@@ -484,6 +499,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||
if !q.started {
|
||||
q.loadLocked()
|
||||
q.started = true
|
||||
q.resumeRunningTasksLocked()
|
||||
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
||||
}
|
||||
hasPending := q.nextPending() != nil
|
||||
@@ -517,15 +533,12 @@ func (q *taskQueue) worker() {
|
||||
t.StartedAt = &now
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
||||
j := newTaskJobState(t.LogPath)
|
||||
t.job = j
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||
j.cancel = taskCancel
|
||||
q.executeTask(t, j, taskCtx)
|
||||
taskCancel()
|
||||
q.runTaskExternal(t, j)
|
||||
|
||||
q.mu.Lock()
|
||||
q.prune()
|
||||
@@ -537,6 +550,207 @@ func (q *taskQueue) worker() {
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) resumeRunningTasksLocked() {
|
||||
for _, t := range q.tasks {
|
||||
if t.Status != TaskRunning {
|
||||
continue
|
||||
}
|
||||
if t.job == nil {
|
||||
t.job = newTaskJobState(t.LogPath)
|
||||
}
|
||||
q.attachExternalTaskControlsLocked(t, t.job)
|
||||
q.startRecoveredTaskMonitorLocked(t, t.job)
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) attachExternalTaskControlsLocked(t *Task, j *jobState) {
|
||||
if t == nil || j == nil {
|
||||
return
|
||||
}
|
||||
j.cancel = func() {
|
||||
pid := t.runnerPID
|
||||
if pid <= 0 {
|
||||
if state, ok := readTaskRunnerState(t); ok {
|
||||
pid = state.PID
|
||||
}
|
||||
}
|
||||
if pid > 0 {
|
||||
_ = syscall.Kill(pid, syscall.SIGTERM)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// startRecoveredTaskMonitorLocked spawns a background monitor for a task
// whose external runner process survived a bee-web restart. The monitor
// tails the task log while polling the runner PID, and finalizes the task
// once the runner exits. Callers must hold q.mu; the monitor itself runs
// outside the lock.
func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
	if t == nil || j == nil || t.runnerPID <= 0 {
		return
	}
	goRecoverOnce("task runner monitor", func() {
		stopTail := make(chan struct{})
		doneTail := make(chan struct{})
		go q.followTaskLog(t, j, stopTail, doneTail)
		// Poll until the external runner process disappears.
		for processAlive(t.runnerPID) {
			time.Sleep(500 * time.Millisecond)
		}
		// Stop the tailer and wait for it to drain before finalizing, so
		// the last log lines land in the job state.
		close(stopTail)
		<-doneTail
		q.finishExternalTask(t, j, nil)
	})
}
||||
|
||||
// runTaskExternal executes task t by spawning the external "bee-worker"
// runner process and blocking until it exits. The task's log file is
// tailed into j for the whole run; on any spawn failure or after the
// runner exits, finishExternalTask settles the task's terminal state.
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
	stopTail := make(chan struct{})
	doneTail := make(chan struct{})
	defer func() {
		// Stop the tailer and wait for its final drain before returning.
		close(stopTail)
		<-doneTail
	}()
	go q.followTaskLog(t, j, stopTail, doneTail)

	cmd, err := externalTaskRunnerCommand(q.opts.ExportDir, t.ID)
	if err != nil {
		j.appendFromLog("ERROR: " + err.Error())
		q.finishExternalTask(t, j, err)
		return
	}
	if err := cmd.Start(); err != nil {
		j.appendFromLog("ERROR: " + err.Error())
		q.finishExternalTask(t, j, err)
		return
	}

	// Record the runner PID and wire cancel controls while holding the
	// queue lock, then persist so a bee-web restart can re-adopt the task.
	q.mu.Lock()
	t.runnerPID = cmd.Process.Pid
	q.attachExternalTaskControlsLocked(t, j)
	q.persistLocked()
	q.mu.Unlock()

	waitErr := cmd.Wait()
	// Small grace period so the runner's final state file write and log
	// flush are visible before the task is finalized.
	time.Sleep(200 * time.Millisecond)
	q.finishExternalTask(t, j, waitErr)
}
|
||||
|
||||
// followTaskLog tails the task's log file and mirrors complete new lines
// into j until stop is closed; done is closed when tailing has fully
// drained, so callers can wait for the last lines to be delivered.
// Tailing starts at the file's current end: earlier content is assumed to
// already be in j.
func (q *taskQueue) followTaskLog(t *Task, j *jobState, stop <-chan struct{}, done chan<- struct{}) {
	defer close(done)
	path := ""
	if t != nil {
		path = t.LogPath
	}
	if strings.TrimSpace(path) == "" {
		return
	}
	// Start at the current end of file; a stat failure leaves offset 0.
	offset := int64(0)
	if info, err := os.Stat(path); err == nil {
		offset = info.Size()
	}
	var partial string // unfinished trailing line, carried across polls
	ticker := time.NewTicker(250 * time.Millisecond)
	defer ticker.Stop()
	flush := func() {
		data, newOffset, err := readTaskLogDelta(path, offset)
		if err != nil || len(data) == 0 {
			offset = newOffset
			return
		}
		offset = newOffset
		// Normalize CRLF, then split: the last element is the (possibly
		// empty) unfinished line kept for the next poll.
		text := partial + strings.ReplaceAll(string(data), "\r\n", "\n")
		lines := strings.Split(text, "\n")
		partial = lines[len(lines)-1]
		for _, line := range lines[:len(lines)-1] {
			if line == "" {
				continue
			}
			j.appendFromLog(line)
		}
	}
	for {
		select {
		case <-ticker.C:
			flush()
		case <-stop:
			// Final drain, then emit any dangling partial line.
			flush()
			if strings.TrimSpace(partial) != "" {
				j.appendFromLog(partial)
			}
			return
		}
	}
}
|
||||
|
||||
// readTaskLogDelta returns the bytes appended to the file at path since
// offset, capped at 1 MiB per call, together with the new offset. If the
// file shrank below offset (rotation/truncation) reading restarts from the
// beginning. On error the incoming offset is returned unchanged alongside
// the error.
func readTaskLogDelta(path string, offset int64) ([]byte, int64, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, offset, openErr
	}
	defer file.Close()
	st, statErr := file.Stat()
	if statErr != nil {
		return nil, offset, statErr
	}
	// The file shrank below our position: restart from the top.
	if st.Size() < offset {
		offset = 0
	}
	if _, seekErr := file.Seek(offset, io.SeekStart); seekErr != nil {
		return nil, offset, seekErr
	}
	const maxChunk = 1 << 20 // read at most 1 MiB per call
	chunk, readErr := io.ReadAll(io.LimitReader(file, maxChunk))
	return chunk, offset + int64(len(chunk)), readErr
}
|
||||
|
||||
// finishExternalTask settles the terminal state of a task whose external
// runner process has exited (or failed to start). The authoritative status
// comes from the runner-state file when it records a non-running status;
// otherwise the runner's exit error, or a generic "exited without final
// state" failure, is used. The queue is persisted and the worker is
// re-triggered so the next pending task can start.
func (q *taskQueue) finishExternalTask(t *Task, j *jobState, waitErr error) {
	q.mu.Lock()
	defer q.mu.Unlock()
	// Already finalized elsewhere (e.g. cancel handler): just close out
	// the job state and nudge the worker.
	if t.Status == TaskDone || t.Status == TaskFailed || t.Status == TaskCancelled {
		if j != nil && !j.isDone() {
			j.finish(t.ErrMsg)
			j.closeLog()
		}
		select {
		case q.trigger <- struct{}{}:
		default:
		}
		return
	}

	state, ok := readTaskRunnerState(t)
	switch {
	case ok && state.Status != TaskRunning:
		// The runner wrote its final state: adopt it verbatim.
		t.Status = state.Status
		t.ErrMsg = state.Error
		now := state.UpdatedAt
		if now.IsZero() {
			now = time.Now()
		}
		t.DoneAt = &now
	case waitErr != nil:
		// The runner exited abnormally before writing a final state.
		now := time.Now()
		t.Status = TaskFailed
		t.ErrMsg = waitErr.Error()
		t.DoneAt = &now
	default:
		// Clean exit but no final state on disk: treat as a failure so
		// the task is never left running forever.
		now := time.Now()
		t.Status = TaskFailed
		t.ErrMsg = "task runner exited without final state"
		t.DoneAt = &now
	}
	t.runnerPID = 0
	q.finalizeTaskArtifactPathsLocked(t)
	q.persistLocked()

	if j != nil && !j.isDone() {
		j.finish(t.ErrMsg)
		j.closeLog()
	}
	if t.ErrMsg != "" {
		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
	} else {
		taskSerialEvent(t, "finished with status="+t.Status)
	}
	// Non-blocking nudge: wake the worker to pick up the next task.
	select {
	case q.trigger <- struct{}{}:
	default:
	}
}
|
||||
|
||||
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
|
||||
startedKmsgWatch := false
|
||||
defer q.finalizeTaskRun(t, j)
|
||||
@@ -985,15 +1199,11 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
if t.job == nil || !t.job.abort() {
|
||||
writeError(w, http.StatusConflict, "task is not cancellable")
|
||||
return
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.persistLocked()
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
writeJSON(w, map[string]string{"status": "aborting"})
|
||||
default:
|
||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||
}
|
||||
@@ -1039,12 +1249,6 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
n++
|
||||
}
|
||||
}
|
||||
@@ -1175,18 +1379,29 @@ func (q *taskQueue) loadLocked() {
|
||||
}
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskRunning {
|
||||
// The task was interrupted by a bee-web restart. Child processes
|
||||
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
|
||||
// their own process groups. Kill any matching stale workers before
|
||||
// marking the task failed so the next GPU test does not inherit a
|
||||
// busy DCGM slot or duplicate workers.
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
_ = platform.KillTestWorkers()
|
||||
state, ok := readTaskRunnerState(t)
|
||||
switch {
|
||||
case ok && state.Status == TaskRunning && processAlive(state.PID):
|
||||
t.runnerPID = state.PID
|
||||
t.job = newTaskJobState(t.LogPath)
|
||||
case ok && state.Status != TaskRunning:
|
||||
t.runnerPID = state.PID
|
||||
t.Status = state.Status
|
||||
t.ErrMsg = state.Error
|
||||
now := state.UpdatedAt
|
||||
if now.IsZero() {
|
||||
now = time.Now()
|
||||
}
|
||||
t.DoneAt = &now
|
||||
default:
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
_ = platform.KillTestWorkers()
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
t.ErrMsg = "interrupted by bee-web restart"
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
t.ErrMsg = "interrupted by bee-web restart"
|
||||
} else if t.Status == TaskPending {
|
||||
t.StartedAt = nil
|
||||
t.DoneAt = nil
|
||||
|
||||
@@ -126,6 +126,37 @@ resolve_iso_version() {
|
||||
resolve_audit_version
|
||||
}
|
||||
|
||||
# sync_builder_workdir SRC DST
# Mirrors the builder source tree into a (possibly reused) work directory,
# preserving expensive build state (cache/chroot) but refusing to let stale
# bootloader templates survive. Exits non-zero on a broken staging result.
sync_builder_workdir() {
    src_dir="$1"
    dst_dir="$2"

    mkdir -p "$dst_dir"

    # Historical bug: old workdirs could keep config/bootloaders/grub-pc even
    # after the source tree moved to grub-efi only. Remove bootloaders eagerly
    # so reused workdirs cannot leak stale templates into a new ISO build.
    rm -rf "$dst_dir/config/bootloaders"

    # Mirror the source tree; the excludes keep per-workdir build state and
    # previous build outputs out of the sync (and protect them from --delete).
    rsync -a --delete \
        --exclude='cache/' \
        --exclude='chroot/' \
        --exclude='.build/' \
        --exclude='*.iso' \
        --exclude='*.packages' \
        --exclude='*.contents' \
        --exclude='*.files' \
        "$src_dir/" "$dst_dir/"

    # Sanity checks: grub-efi assets must be present, grub-pc must be gone.
    if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
        echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
        exit 1
    fi
    if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
        echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
        exit 1
    fi
}
|
||||
|
||||
iso_list_files() {
|
||||
iso_path="$1"
|
||||
|
||||
@@ -466,6 +497,75 @@ validate_iso_memtest() {
|
||||
echo "=== memtest validation OK ==="
|
||||
}
|
||||
|
||||
# validate_iso_live_boot_entries ISO
# Extracts the GRUB and isolinux live configs from the built ISO and checks
# that the EASY-BEE menu entries exist, carry boot=live (plus toram for the
# RAM entry), and contain no unresolved live-build placeholders. Exits
# non-zero on any validation failure.
validate_iso_live_boot_entries() {
    iso_path="$1"
    echo "=== validating live boot entries in ISO ==="

    [ -f "$iso_path" ] || {
        echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
        exit 1
    }
    require_iso_reader "$iso_path" >/dev/null 2>&1 || {
        echo "ERROR: ISO reader unavailable for live boot validation" >&2
        exit 1
    }

    # Scratch copies of the two bootloader configs pulled out of the ISO.
    grub_cfg="$(mktemp)"
    isolinux_cfg="$(mktemp)"

    iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
        echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
        rm -f "$grub_cfg" "$isolinux_cfg"
        exit 1
    }
    iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
        echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
        rm -f "$grub_cfg" "$isolinux_cfg"
        exit 1
    }

    # live-build template placeholders must have been substituted.
    if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
        echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
        rm -f "$grub_cfg" "$isolinux_cfg"
        exit 1
    fi

    # GRUB: default entry, toram entry, and their kernel command lines.
    grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
        echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
        rm -f "$grub_cfg" "$isolinux_cfg"
        exit 1
    }
    grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
        echo "ERROR: GRUB toram entry is missing" >&2
        rm -f "$grub_cfg" "$isolinux_cfg"
        exit 1
    }
    grep -q 'linux .*boot=live ' "$grub_cfg" || {
        echo "ERROR: GRUB live entry is missing boot=live" >&2
        rm -f "$grub_cfg" "$isolinux_cfg"
        exit 1
    }
    grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
        echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
        rm -f "$grub_cfg" "$isolinux_cfg"
        exit 1
    }

    # isolinux: same checks against the BIOS-boot append lines.
    grep -q 'append .*boot=live ' "$isolinux_cfg" || {
        echo "ERROR: isolinux live entry is missing boot=live" >&2
        rm -f "$grub_cfg" "$isolinux_cfg"
        exit 1
    }
    grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
        echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
        rm -f "$grub_cfg" "$isolinux_cfg"
        exit 1
    }

    rm -f "$grub_cfg" "$isolinux_cfg"
    echo "=== live boot validation OK ==="
}
|
||||
|
||||
validate_iso_nvidia_runtime() {
|
||||
iso_path="$1"
|
||||
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||
@@ -558,6 +658,21 @@ extract_live_grub_entry() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# load_live_build_append LB_DIR
# Sources live-build's generated config/binary and exports its live kernel
# command line into the global live_build_append variable. Returns 1 when
# the config file is absent or defines no LB_BOOTAPPEND_LIVE.
load_live_build_append() {
    lb_dir="$1"
    binary_cfg="$lb_dir/config/binary"
    [ -f "$binary_cfg" ] || return 1

    # config/binary is generated by live-build and contains shell variable
    # assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
    # shellcheck disable=SC1090
    . "$binary_cfg"

    [ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
    live_build_append="$LB_BOOTAPPEND_LIVE"
    return 0
}
|
||||
|
||||
extract_live_isolinux_entry() {
|
||||
cfg="$1"
|
||||
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||
@@ -594,36 +709,15 @@ echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — load to RAM (toram)" {
|
||||
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||
linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux ${kernel} ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — KMS + GSP=off" {
|
||||
linux ${kernel} ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — fail-safe" {
|
||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd ${initrd}
|
||||
}
|
||||
}
|
||||
|
||||
if [ "\${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
@@ -699,13 +793,18 @@ enforce_live_build_bootloader_assets() {
|
||||
grub_dir="$lb_dir/binary/boot/grub"
|
||||
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||
|
||||
if ! load_live_build_append "$lb_dir"; then
|
||||
echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
|
||||
live_build_append=""
|
||||
fi
|
||||
|
||||
if [ -f "$grub_cfg" ]; then
|
||||
if extract_live_grub_entry "$grub_cfg"; then
|
||||
mkdir -p "$grub_dir/live-theme"
|
||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
||||
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
||||
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "$grub_append" "$grub_initrd"
|
||||
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||
else
|
||||
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
||||
@@ -714,7 +813,7 @@ enforce_live_build_bootloader_assets() {
|
||||
|
||||
if [ -f "$isolinux_cfg" ]; then
|
||||
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
||||
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "$isolinux_append"
|
||||
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
|
||||
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
||||
else
|
||||
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
||||
@@ -1112,15 +1211,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
|
||||
# Sync builder config into variant work dir, preserving lb cache.
|
||||
rsync -a --delete \
|
||||
--exclude='cache/' \
|
||||
--exclude='chroot/' \
|
||||
--exclude='.build/' \
|
||||
--exclude='*.iso' \
|
||||
--exclude='*.packages' \
|
||||
--exclude='*.contents' \
|
||||
--exclude='*.files' \
|
||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||
sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
|
||||
|
||||
# Share deb package cache across variants.
|
||||
# Restore: populate work dir cache from shared cache before build.
|
||||
@@ -1411,8 +1502,11 @@ dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||
echo "=== enforcing canonical bootloader assets ==="
|
||||
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
||||
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "rm -f '${LB_DIR}/live-image-amd64.hybrid.iso' && lb binary_iso 2>&1"
|
||||
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
|
||||
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
||||
|
||||
# --- persist deb package cache back to shared location ---
|
||||
@@ -1438,6 +1532,7 @@ if [ -f "$ISO_RAW" ]; then
|
||||
fi
|
||||
fi
|
||||
validate_iso_memtest "$ISO_RAW"
|
||||
validate_iso_live_boot_entries "$ISO_RAW"
|
||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
|
||||
@@ -23,9 +23,9 @@ insmod serial
|
||||
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
||||
|
||||
insmod gfxterm
|
||||
insmod png
|
||||
|
||||
source /boot/grub/theme.cfg
|
||||
|
||||
terminal_input console serial
|
||||
terminal_output gfxterm serial
|
||||
|
||||
insmod png
|
||||
source /boot/grub/theme.cfg
|
||||
|
||||
@@ -1,47 +1,16 @@
|
||||
source /boot/grub/config.cfg
|
||||
|
||||
echo ""
|
||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||
echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — load to RAM (toram)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — KMS + GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — fail-safe" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 70 KiB After Width: | Height: | Size: 78 KiB |
@@ -5,12 +5,10 @@ title-text: ""
|
||||
message-font: "Unifont Regular 16"
|
||||
terminal-font: "Unifont Regular 16"
|
||||
|
||||
#bee logo — centered, upper third of screen
|
||||
#bee logo - centered, upper third of screen
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
width = 400
|
||||
height = 400
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
|
||||
@@ -36,11 +34,11 @@ terminal-font: "Unifont Regular 16"
|
||||
item_font = "Unifont Regular 16"
|
||||
selected_item_color= "#f5a800"
|
||||
selected_item_font = "Unifont Regular 16"
|
||||
item_height = 16
|
||||
item_padding = 0
|
||||
item_height = 20
|
||||
item_padding = 2
|
||||
item_spacing = 4
|
||||
icon_width = 0
|
||||
icon_heigh = 0
|
||||
icon_height = 0
|
||||
item_icon_space = 0
|
||||
}
|
||||
|
||||
|
||||
@@ -60,35 +60,129 @@ wait_for_process_exit() {
|
||||
return 0
|
||||
}
|
||||
|
||||
kill_pattern() {
|
||||
pattern="$1"
|
||||
if pgrep -f "$pattern" >/dev/null 2>&1; then
|
||||
pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
|
||||
log_pid_details() {
|
||||
pid="$1"
|
||||
line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
|
||||
if [ -n "$line" ]; then
|
||||
log_blocker "$line"
|
||||
else
|
||||
log_blocker "pid $pid"
|
||||
fi
|
||||
}
|
||||
|
||||
collect_gpu_compute_pids() {
|
||||
index="$1"
|
||||
if ! command -v nvidia-smi >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
nvidia-smi --id="$index" \
|
||||
--query-compute-apps=pid \
|
||||
--format=csv,noheader,nounits 2>/dev/null \
|
||||
| sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
|
||||
| grep -E '^[0-9]+$' || true
|
||||
}
|
||||
|
||||
collect_gpu_device_pids() {
|
||||
index="$1"
|
||||
dev="/dev/nvidia$index"
|
||||
[ -e "$dev" ] || return 0
|
||||
if command -v fuser >/dev/null 2>&1; then
|
||||
fuser "$dev" 2>/dev/null \
|
||||
| tr ' ' '\n' \
|
||||
| sed 's/[^0-9].*$//' \
|
||||
| grep -E '^[0-9]+$' || true
|
||||
fi
|
||||
}
|
||||
|
||||
collect_gpu_holder_pids() {
|
||||
index="$1"
|
||||
{
|
||||
collect_gpu_compute_pids "$index"
|
||||
collect_gpu_device_pids "$index"
|
||||
} | awk 'NF' | sort -u
|
||||
}
|
||||
|
||||
kill_pid_list() {
|
||||
pids="$1"
|
||||
[ -n "$pids" ] || return 0
|
||||
|
||||
for pid in $pids; do
|
||||
log_pid_details "$pid"
|
||||
done
|
||||
log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"
|
||||
for pid in $pids; do
|
||||
kill -TERM "$pid" >/dev/null 2>&1 || true
|
||||
done
|
||||
sleep 1
|
||||
for pid in $pids; do
|
||||
if kill -0 "$pid" >/dev/null 2>&1; then
|
||||
log "forcing GPU holder PID $pid to exit"
|
||||
kill -KILL "$pid" >/dev/null 2>&1 || true
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
gpu_has_display_holders() {
|
||||
index="$1"
|
||||
holders=$(collect_gpu_device_pids "$index")
|
||||
[ -n "$holders" ] || return 1
|
||||
for pid in $holders; do
|
||||
comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
|
||||
case "$comm" in
|
||||
Xorg|Xwayland|X|gnome-shell)
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
stop_nv_hostengine_if_running() {
|
||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||
[ -n "$line" ] || continue
|
||||
log_blocker "$line"
|
||||
done
|
||||
log "killing processes matching: $pattern"
|
||||
pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
|
||||
sleep 1
|
||||
pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
|
||||
log "stopping nv-hostengine"
|
||||
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||
hostengine_was_active=1
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
stop_fabricmanager_if_active() {
|
||||
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||
log_blocker "service nvidia-fabricmanager.service"
|
||||
fabric_was_active=1
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
stop_display_stack_if_active() {
|
||||
stopped=1
|
||||
for unit in display-manager.service lightdm.service; do
|
||||
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||
log_blocker "service $unit"
|
||||
display_was_active=1
|
||||
stopped=0
|
||||
fi
|
||||
done
|
||||
return "$stopped"
|
||||
}
|
||||
|
||||
try_gpu_reset() {
|
||||
index="$1"
|
||||
log "resetting GPU $index"
|
||||
nvidia-smi -r -i "$index"
|
||||
}
|
||||
|
||||
drain_gpu_clients() {
|
||||
display_was_active=0
|
||||
fabric_was_active=0
|
||||
|
||||
for unit in display-manager.service lightdm.service; do
|
||||
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||
log_blocker "service $unit"
|
||||
display_was_active=1
|
||||
fi
|
||||
done
|
||||
|
||||
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||
log_blocker "service nvidia-fabricmanager.service"
|
||||
fabric_was_active=1
|
||||
fi
|
||||
hostengine_was_active=0
|
||||
|
||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||
@@ -98,21 +192,25 @@ drain_gpu_clients() {
|
||||
log "stopping nv-hostengine"
|
||||
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||
hostengine_was_active=1
|
||||
fi
|
||||
|
||||
for pattern in \
|
||||
"nvidia-smi" \
|
||||
"dcgmi" \
|
||||
"nvvs" \
|
||||
"dcgmproftester" \
|
||||
"all_reduce_perf" \
|
||||
"nvtop" \
|
||||
"bee-gpu-burn" \
|
||||
"bee-john-gpu-stress" \
|
||||
"bee-nccl-gpu-stress" \
|
||||
"Xorg" \
|
||||
"Xwayland"; do
|
||||
kill_pattern "$pattern"
|
||||
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||
log_blocker "service nvidia-fabricmanager.service"
|
||||
fabric_was_active=1
|
||||
fi
|
||||
|
||||
for unit in display-manager.service lightdm.service; do
|
||||
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||
log_blocker "service $unit"
|
||||
display_was_active=1
|
||||
fi
|
||||
done
|
||||
|
||||
for dev in /dev/nvidia[0-9]*; do
|
||||
[ -e "$dev" ] || continue
|
||||
holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
|
||||
kill_pid_list "$holders"
|
||||
done
|
||||
}
|
||||
|
||||
@@ -125,7 +223,7 @@ restore_gpu_clients() {
|
||||
fi
|
||||
fi
|
||||
|
||||
if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
log "starting nv-hostengine"
|
||||
nv-hostengine
|
||||
fi
|
||||
@@ -153,10 +251,60 @@ restart_drivers() {
|
||||
|
||||
reset_gpu() {
|
||||
index="$1"
|
||||
drain_gpu_clients
|
||||
log "resetting GPU $index"
|
||||
nvidia-smi -r -i "$index"
|
||||
display_was_active=0
|
||||
fabric_was_active=0
|
||||
hostengine_was_active=0
|
||||
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
if try_gpu_reset "$index"; then
|
||||
restore_gpu_clients
|
||||
return 0
|
||||
fi
|
||||
|
||||
stop_nv_hostengine_if_running || true
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
if try_gpu_reset "$index"; then
|
||||
restore_gpu_clients
|
||||
return 0
|
||||
fi
|
||||
|
||||
stop_fabricmanager_if_active || true
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
if try_gpu_reset "$index"; then
|
||||
restore_gpu_clients
|
||||
return 0
|
||||
fi
|
||||
|
||||
if gpu_has_display_holders "$index"; then
|
||||
stop_display_stack_if_active || true
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
if try_gpu_reset "$index"; then
|
||||
restore_gpu_clients
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
log "GPU $index still has holders after targeted drain"
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
try_gpu_reset "$index"
|
||||
rc=$?
|
||||
restore_gpu_clients
|
||||
return "$rc"
|
||||
}
|
||||
|
||||
cmd="${1:-}"
|
||||
|
||||
@@ -47,6 +47,13 @@ echo "==> Сборка бинарника..."
|
||||
)
|
||||
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
|
||||
|
||||
LOCAL_SHA="$(shasum -a 256 "${LOCAL_BIN}" | awk '{print $1}')"
|
||||
REMOTE_SHA="$("${SSH_CMD[@]}" "$REMOTE" "if [ -f '${REMOTE_BIN}' ] && command -v sha256sum >/dev/null 2>&1; then sha256sum '${REMOTE_BIN}' | awk '{print \\$1}'; fi" 2>/dev/null || true)"
|
||||
if [[ -n "${REMOTE_SHA}" && "${LOCAL_SHA}" == "${REMOTE_SHA}" ]]; then
|
||||
echo "==> Бинарник не изменился (${LOCAL_SHA}); копирование и перезапуск сервисов пропущены."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- Deploy ---
|
||||
echo "==> Копирование на ${REMOTE}..."
|
||||
"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"
|
||||
|
||||
Reference in New Issue
Block a user