Compare commits

...

10 Commits
v8.17 ... v8.24

Author SHA1 Message Date
5ba72ab315 Add rsync to initramfs for toram progress output
live-boot already uses rsync --progress when /bin/rsync exists; without
it the copy falls back to silent cp -a. Add rsync to the ISO package
list and install an initramfs-tools hook (bee-rsync) that copies the
rsync binary + shared libs into the initrd via copy_exec. The hook then
rebuilds the initramfs so the change takes effect in the ISO's initrd.img.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-17 23:52:47 +03:00
63363e9629 Add toram boot entry and Install to RAM resume support
- grub.cfg: add "load to RAM (toram)" entry to advanced submenu
- install_to_ram.go: resume from existing /dev/shm/bee-live copy if
  source medium is unavailable after bee-web restart
- tasks.go: fix "Recovered after bee-web restart" shown on every run
  (check j.lines before first append, not after)
- bee-install: retry unsquashfs up to 5x with wait-for-remount on
  source loss; clear error message with bee-remount-medium hint
- bee-remount-medium: new script to find and remount live ISO source
  after USB/CD reconnect; supports --wait polling mode
- 9000-bee-setup: chmod +x for bee-install and bee-remount-medium

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-17 23:48:56 +03:00
Mikhail Chusavitin
5285c0d101 Capture per-run IPMI power and GPU telemetry in power benchmark
- Sample IPMI loaded_w per single-card calibration and per ramp step
  instead of averaging over the entire Phase 2; top-level ServerPower
  uses the final (all-GPU) ramp step value
- Add ServerLoadedW/ServerDeltaW to NvidiaPowerBenchGPU and
  NvidiaPowerBenchStep so external tooling can compare wall power per
  phase without re-parsing logs
- Write gpu-metrics.csv/.html inside each single-XX/ and step-XX/
  subdir; aggregate all phases into a top-level gpu-metrics.csv/.html
- Write 00-nvidia-smi-q.log at the start of every power run
- Add Telemetry (p95 temp/power/fan/clock) to NvidiaPowerBenchGPU in
  result.json from the converged calibration attempt
- Power benchmark page: split "Achieved W" into Single-card W and
  Multi-GPU W (StablePowerLimitW); derate highlight and status color
  now reflect the final multi-GPU limit vs nominal
- Performance benchmark page: add Status column and per-GPU score
  color coding (green/yellow/red) based on gpu.Status and OverallStatus

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-17 17:59:58 +03:00
Mikhail Chusavitin
dca4afb8d0 Seed power ramp with single-card TDP limits 2026-04-16 11:43:01 +03:00
Mikhail Chusavitin
b4280941f5 Move NCCL and NVBandwidth into validate mode 2026-04-16 11:02:30 +03:00
Mikhail Chusavitin
f74976ec4c Use static overlay wallpaper in ISO build 2026-04-16 10:54:03 +03:00
Mikhail Chusavitin
18e24a9aa5 Estimate fan duty from observed RPM maxima 2026-04-16 10:10:18 +03:00
Mikhail Chusavitin
e306250da7 Disable fp64/fp4 in mixed gpu burn 2026-04-16 10:00:03 +03:00
Mikhail Chusavitin
c5b2081ac9 Disable unstable fp4/fp64 benchmark phases 2026-04-16 09:58:02 +03:00
434528083e Power bench: compare GPU-reported TDP vs IPMI server power delta
- NvidiaPowerBenchResult gains ServerPower *BenchmarkServerPower
- RunNvidiaPowerBench samples IPMI idle before Phase 1 and loaded via
  background goroutine throughout Phase 2 ramp
- renderPowerBenchReport: new "Server vs GPU Power Comparison" table
  with ratio annotation (✓ match / ⚠ minor / ✗ over-report)
- renderPowerBenchSummary: server_idle_w, server_loaded_w, server_delta_w,
  server_reporting_ratio keys

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 07:21:02 +03:00
23 changed files with 1085 additions and 322 deletions

View File

@@ -146,7 +146,7 @@ type satRunner interface {
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
}
type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
}
// RunNCCLTests runs the NCCL test suite via the SAT runner for the given GPU
// indices, substituting DefaultSATBaseDir when baseDir is blank or whitespace.
// It returns the result archive path produced by the runner.
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
	// Fall back to the default base directory only for an effectively-empty value.
	if len(strings.TrimSpace(baseDir)) == 0 {
		baseDir = DefaultSATBaseDir
	}
	return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
body := "Results: " + path
if err != nil && err != context.Canceled {
body += "\nERROR: " + err.Error()

View File

@@ -128,6 +128,7 @@ type fakeSAT struct {
runNvidiaPowerFn func(string, int, []int) (string, error)
runNvidiaPulseFn func(string, int, []int) (string, error)
runNvidiaBandwidthFn func(string, []int) (string, error)
runNCCLFn func(string, []int) (string, error)
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
return "", nil
}
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNCCLFn != nil {
return f.runNCCLFn(baseDir, gpuIndices)
}
return "", nil
}
// TestRunNCCLTestsPassesSelectedGPUs verifies that App.RunNCCLTests forwards
// the base directory and the selected GPU indices to the underlying SAT
// runner unchanged, and returns the runner's archive path.
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
	t.Parallel()
	var (
		seenBaseDir string
		seenGPUs    []int
	)
	app := &App{
		sat: fakeSAT{
			runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
				seenBaseDir = baseDir
				// Clone so later mutation by the caller cannot affect the assertion.
				seenGPUs = append([]int(nil), gpuIndices...)
				return "/tmp/nccl-tests.tar.gz", nil
			},
		},
	}
	path, err := app.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
	if err != nil {
		t.Fatalf("RunNCCLTests error: %v", err)
	}
	if path != "/tmp/nccl-tests.tar.gz" {
		t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
	}
	if seenBaseDir != "/tmp/sat" {
		t.Fatalf("baseDir=%q want %q", seenBaseDir, "/tmp/sat")
	}
	if len(seenGPUs) != 2 || seenGPUs[0] != 3 || seenGPUs[1] != 1 {
		t.Fatalf("gpuIndices=%v want [3 1]", seenGPUs)
	}
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()

View File

@@ -59,6 +59,9 @@ type benchmarkPowerCalibrationResult struct {
// ≥20% while server fans were below 100% duty cycle — a signal that the
// cooling system may not be correctly configured for full GPU load.
CoolingWarning string
// MetricRows holds the telemetry rows from the final (converged) attempt
// for this GPU. Used to build per-run gpu-metrics.csv.
MetricRows []GPUMetricRow
}
type benchmarkBurnProfile struct {
@@ -94,9 +97,13 @@ var (
)
// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
//
// fp64 and fp4 are intentionally disabled for now: both are currently unstable
// on the target fleet and can abort the mixed steady stage after the earlier
// phases already collected useful telemetry.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32"}
func computeCapabilityCode(raw string) int {
raw = strings.TrimSpace(raw)
@@ -124,6 +131,15 @@ func benchmarkSupportedPrecisions(computeCapability string) []string {
return out
}
// benchmarkPrecisionEnabled reports whether the given precision category is
// part of the currently enabled benchmark set. fp64 and fp4 are deliberately
// absent (disabled elsewhere in this file as unstable on the target fleet).
func benchmarkPrecisionEnabled(category string) bool {
	for _, enabled := range []string{"int8", "fp8", "fp16", "fp16_bf16", "fp32", "fp32_tf32"} {
		if category == enabled {
			return true
		}
	}
	return false
}
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
if len(precisions) == 0 {
precisions = append([]string(nil), benchmarkPrecisionPhases...)
@@ -514,6 +530,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
}
applyBenchmarkSteadyFallback(&gpuResult)
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" {
@@ -1108,6 +1125,7 @@ type benchmarkCoolingSample struct {
AvgFanRPM float64
AvgFanDutyCyclePct float64
FanDutyCycleAvailable bool
FanDutyCycleEstimated bool
}
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
@@ -1120,6 +1138,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
samples[i].FanAvgRPM = fanSample.AvgFanRPM
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
}
return samples, nil
}
@@ -1127,11 +1146,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
fans, _ := sampleFanSpeeds()
avgRPM, _, _ := fanRPMStats(fans)
dutyPct, dutyAvailable := sampleFanDutyCyclePct()
dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
return benchmarkCoolingSample{
AvgFanRPM: avgRPM,
AvgFanDutyCyclePct: dutyPct,
FanDutyCycleAvailable: dutyAvailable,
FanDutyCycleEstimated: dutyEstimated,
}
}
@@ -1373,44 +1393,91 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
}
var rpmValues []float64
var dutyValues []float64
var dutyEstimated bool
for _, row := range rows {
if row.FanAvgRPM > 0 {
rpmValues = append(rpmValues, row.FanAvgRPM)
}
if row.FanDutyCycleAvailable {
dutyValues = append(dutyValues, row.FanDutyCyclePct)
if row.FanDutyCycleEstimated {
dutyEstimated = true
}
}
}
if len(rpmValues) == 0 && len(dutyValues) == 0 {
return nil
}
summary := &BenchmarkCoolingSummary{
Available: true,
AvgFanRPM: benchmarkMean(rpmValues),
Available: true,
AvgFanRPM: benchmarkMean(rpmValues),
FanDutyCycleEstimated: dutyEstimated,
}
if len(dutyValues) > 0 {
summary.FanDutyCycleAvailable = true
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
if summary.FanDutyCycleEstimated {
summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
}
} else {
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
}
return summary
}
// benchmarkTelemetryAvailable reports whether the summary contains any usable
// telemetry: at least one sample or a non-zero measurement duration.
func benchmarkTelemetryAvailable(summary BenchmarkTelemetrySummary) bool {
	if summary.Samples > 0 {
		return true
	}
	return summary.DurationSec > 0
}
// benchmarkPrecisionSteadyFallback picks the best per-precision steady-state
// summary to stand in for missing mixed-phase telemetry. "Best" is the phase
// with the longest steady duration; ties break on higher p95 power. The bool
// result reports whether any phase had usable telemetry at all.
func benchmarkPrecisionSteadyFallback(phases []BenchmarkPrecisionSteadyPhase) (BenchmarkTelemetrySummary, string, bool) {
	var (
		best      BenchmarkTelemetrySummary
		bestLabel string
		found     bool
	)
	for _, phase := range phases {
		if !benchmarkTelemetryAvailable(phase.Steady) {
			continue
		}
		better := false
		switch {
		case !found:
			better = true
		case phase.Steady.DurationSec > best.DurationSec:
			better = true
		case phase.Steady.DurationSec == best.DurationSec && phase.Steady.P95PowerW > best.P95PowerW:
			better = true
		}
		if better {
			best = phase.Steady
			bestLabel = phase.Precision
			found = true
		}
	}
	return best, bestLabel, found
}
// applyBenchmarkSteadyFallback substitutes the best per-precision steady
// summary for gpu.Steady when the mixed steady phase produced no telemetry,
// appending a note naming the precision phase the fallback came from.
// A nil gpu or an already-populated Steady summary is left untouched.
func applyBenchmarkSteadyFallback(gpu *BenchmarkGPUResult) {
	if gpu == nil {
		return
	}
	if benchmarkTelemetryAvailable(gpu.Steady) {
		return
	}
	fallback, label, ok := benchmarkPrecisionSteadyFallback(gpu.PrecisionSteady)
	if !ok {
		return
	}
	gpu.Steady = fallback
	gpu.Notes = append(gpu.Notes,
		fmt.Sprintf("mixed steady telemetry unavailable; reporting steady-state fallback from %s precision phase", label))
}
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
score := BenchmarkScorecard{}
// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
// Each precision ran alone with full GPU dedicated — peak capability.
for _, p := range gpu.PrecisionSteady {
if !benchmarkPrecisionEnabled(p.Precision) {
continue
}
score.SyntheticScore += p.WeightedTeraOpsPerSec
}
// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
// All precisions compete simultaneously — closer to real inference workloads.
for _, p := range gpu.PrecisionResults {
if p.Supported {
if p.Supported && benchmarkPrecisionEnabled(p.Category) {
score.MixedScore += p.WeightedTeraOpsPerSec
}
}
@@ -1441,10 +1508,17 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
// so CV reflects genuine power regulation, not workload switching).
if len(gpu.PrecisionSteady) > 0 {
var sum float64
var count int
for _, p := range gpu.PrecisionSteady {
if !benchmarkPrecisionEnabled(p.Precision) {
continue
}
sum += clampScore(100 - p.Steady.PowerCVPct*3)
count++
}
if count > 0 {
score.PowerSustainScore = sum / float64(count)
}
score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady))
} else if gpu.Steady.PowerCVPct > 0 {
score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3)
}
@@ -2512,6 +2586,7 @@ func runNvidiaBenchmarkParallel(
// Score and finalize each GPU.
for _, idx := range selected {
r := gpuResults[idx]
applyBenchmarkSteadyFallback(r)
r.Scores = scoreBenchmarkGPUResult(*r)
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
pr := parseResults[idx]
@@ -2694,19 +2769,22 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
return cl
}
// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
// throttle counters. If a GPU starts throttling, the current targeted_power run
// is canceled immediately, the power limit is reduced, and a fresh full cycle
// is started again from the beginning. The selected reduced power limit stays
// active for the main benchmark and is restored by the caller afterwards.
// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
// actively watches throttle counters. seedLimits, when provided, are treated as
// the starting point for this calibration pass rather than as immutable fixed
// limits. This matters during cumulative ramp-up: once an additional GPU is
// introduced, every already-active GPU must be revalidated under the new
// thermal state instead of assuming its previous single-step limit is still
// valid. The selected reduced power limits stay active for the main benchmark
// and are restored by the caller afterwards.
func runBenchmarkPowerCalibration(
ctx context.Context,
verboseLog, runDir string,
gpuIndices []int,
infoByIndex map[int]benchmarkGPUInfo,
logFunc func(string),
fixedLimits map[int]int,
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
seedLimits map[int]int,
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
const calibDurationSec = 120
const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts.
@@ -2720,7 +2798,7 @@ func runBenchmarkPowerCalibration(
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
}
if killed := KillTestWorkers(); len(killed) > 0 {
for _, p := range killed {
@@ -2739,7 +2817,6 @@ func runBenchmarkPowerCalibration(
err error
}
// gpuCalibState holds per-GPU binary search state during parallel calibration.
type gpuCalibState struct {
idx int
@@ -2755,6 +2832,8 @@ func runBenchmarkPowerCalibration(
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction
var allCalibRows []GPUMetricRow // accumulated telemetry across all attempts
var calibCursor float64
// Initialise per-GPU state.
states := make([]*gpuCalibState, 0, len(gpuIndices))
@@ -2796,19 +2875,20 @@ func runBenchmarkPowerCalibration(
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
}
if fixedLimits != nil {
if fixedW, ok := fixedLimits[idx]; ok {
// This GPU's limit was established in a prior ramp step and must
// remain unchanged. Apply it immediately and skip the binary search.
if canDerate && fixedW > 0 {
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW)
if seedLimits != nil {
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
// A previously validated limit is only a starting point. Re-run
// targeted_power under the current multi-GPU thermal load and derate
// again if this step shows new throttling.
if canDerate {
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
}
s.appliedLimitW = fixedW
s.calib.AppliedPowerLimitW = float64(fixedW)
s.calib.Completed = true
s.converged = true
s.appliedLimitW = seedW
s.hi = seedW + 1
s.calib.AppliedPowerLimitW = float64(seedW)
s.calib.Derated = seedW < s.originalLimitW
s.calib.Notes = append(s.calib.Notes,
fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW))
fmt.Sprintf("seed limit: %d W (revalidating under current thermal load)", seedW))
}
}
states = append(states, s)
@@ -2906,6 +2986,8 @@ calibDone:
ticker.Stop()
cancelAttempt()
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
// Accumulate telemetry rows with attempt stage label.
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
// Resource busy: retry with exponential back-off (shared — one DCGM session).
if ar.err != nil && isDCGMResourceBusy(ar.err) {
@@ -2990,6 +3072,7 @@ calibDone:
}
}
}
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
s.converged = true
continue
}
@@ -3028,6 +3111,7 @@ calibDone:
} else {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
}
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
s.converged = true
continue
}
@@ -3065,7 +3149,8 @@ calibDone:
results[s.idx] = s.calib
}
}
return results, restore
writeBenchmarkMetricsFiles(runDir, allCalibRows)
return results, restore, allCalibRows
}
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3091,7 +3176,6 @@ func powerBenchDurationSec(profile string) int {
}
}
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
out := make(map[int]benchmarkGPUInfo, len(src))
for k, v := range src {
@@ -3107,7 +3191,42 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW)
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW)
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio)
}
b.WriteString("\n")
// Server power comparison table.
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n")
b.WriteString("| Metric | Value |\n")
b.WriteString("|--------|-------|\n")
fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
if sp.Available {
fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | %.0f W |\n", sp.DeltaW)
ratio := sp.ReportingRatio
ratioNote := ""
switch {
case ratio >= 0.9:
ratioNote = "✓ GPU telemetry matches server power"
case ratio >= 0.75:
ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP"
default:
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
}
fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
} else {
b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
}
for _, note := range sp.Notes {
fmt.Fprintf(&b, "\n> %s\n", note)
}
b.WriteString("\n")
}
if len(result.Findings) > 0 {
b.WriteString("## Summary\n\n")
for _, finding := range result.Findings {
@@ -3121,21 +3240,25 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
if len(result.RampSteps) > 0 {
b.WriteString("## Ramp Sequence\n\n")
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
for _, step := range result.RampSteps {
derated := "-"
if step.Derated {
derated = "⚠ yes"
}
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n",
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
serverDelta := "-"
if step.ServerDeltaW > 0 {
serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
}
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
}
b.WriteString("\n")
}
b.WriteString("## Per-Slot Results\n\n")
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
for _, gpu := range result.GPUs {
stableLimit := "-"
if gpu.StablePowerLimitW > 0 {
@@ -3145,8 +3268,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
}
}
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n",
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
serverDelta := "-"
if gpu.ServerDeltaW > 0 {
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
}
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
}
b.WriteString("\n")
for _, gpu := range result.GPUs {
@@ -3175,11 +3302,25 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
if step.ServerLoadedW > 0 {
fmt.Fprintf(&b, "ramp_step_%d_server_loaded_w=%.0f\n", step.StepIndex, step.ServerLoadedW)
fmt.Fprintf(&b, "ramp_step_%d_server_delta_w=%.0f\n", step.StepIndex, step.ServerDeltaW)
}
}
for _, gpu := range result.GPUs {
if gpu.StablePowerLimitW > 0 {
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
}
if gpu.ServerLoadedW > 0 {
fmt.Fprintf(&b, "gpu_%d_server_loaded_w=%.0f\n", gpu.Index, gpu.ServerLoadedW)
fmt.Fprintf(&b, "gpu_%d_server_delta_w=%.0f\n", gpu.Index, gpu.ServerDeltaW)
}
}
if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
fmt.Fprintf(&b, "server_loaded_w=%.0f\n", sp.LoadedW)
fmt.Fprintf(&b, "server_delta_w=%.0f\n", sp.DeltaW)
fmt.Fprintf(&b, "server_reporting_ratio=%.2f\n", sp.ReportingRatio)
}
return b.String()
}
@@ -3212,6 +3353,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
if infoErr != nil {
return "", infoErr
}
// Capture full nvidia-smi -q snapshot at the start of the run.
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
hostname, _ := os.Hostname()
result := NvidiaPowerBenchResult{
BenchmarkVersion: benchmarkVersion,
@@ -3224,16 +3369,44 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
// Sample IPMI idle power before any GPU load.
var serverIdleW float64
var serverIdleOK bool
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
singleIPMILoadedW := make(map[int]float64, len(selected))
var allRestoreActions []benchmarkRestoreAction
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
var allPowerRows []GPUMetricRow
var powerCursor float64
for _, idx := range selected {
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
ipmiSingleDone := make(chan float64, 1)
go func() {
defer close(ipmiSingleDone)
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
ipmiSingleDone <- w
}
}()
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
ipmiSingleCancel()
if w, ok := <-ipmiSingleDone; ok {
singleIPMILoadedW[idx] = w
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w))
}
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
calibByIndex[idx] = r
@@ -3258,7 +3431,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.OverallStatus = "PARTIAL"
}
}
gpus = append(gpus, NvidiaPowerBenchGPU{
gpu := NvidiaPowerBenchGPU{
Index: idx,
Name: info.Name,
BusID: info.BusID,
@@ -3271,7 +3444,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
Status: status,
Notes: append([]string(nil), calib.Notes...),
CoolingWarning: calib.CoolingWarning,
})
}
if w, ok := singleIPMILoadedW[idx]; ok && serverIdleOK && w > 0 {
gpu.ServerLoadedW = w
gpu.ServerDeltaW = w - serverIdleW
}
if len(calib.MetricRows) > 0 {
t := summarizeBenchmarkTelemetry(calib.MetricRows)
gpu.Telemetry = &t
}
gpus = append(gpus, gpu)
}
sort.Slice(gpus, func(i, j int) bool {
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
@@ -3320,20 +3502,30 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
// serverLoadedW tracks the IPMI server power from the final ramp step
// (all GPUs simultaneously loaded). Earlier steps' values are stored
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
var serverLoadedW float64
var serverLoadedOK bool
// Step 1: reuse single-card calibration result directly.
if len(result.RecommendedSlotOrder) > 0 {
firstIdx := result.RecommendedSlotOrder[0]
firstCalib := calibByIndex[firstIdx]
stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW))
ramp := NvidiaPowerBenchStep{
StepIndex: 1,
GPUIndices: []int{firstIdx},
NewGPUIndex: firstIdx,
NewGPUStableLimitW: firstCalib.AppliedPowerLimitW,
StepIndex: 1,
GPUIndices: []int{firstIdx},
NewGPUIndex: firstIdx,
NewGPUStableLimitW: firstCalib.AppliedPowerLimitW,
TotalObservedPowerW: firstCalib.Summary.P95PowerW,
AvgObservedPowerW: firstCalib.Summary.P95PowerW,
Derated: firstCalib.Derated,
Status: "OK",
Derated: firstCalib.Derated,
Status: "OK",
}
if w, ok := singleIPMILoadedW[firstIdx]; ok && serverIdleOK && w > 0 {
ramp.ServerLoadedW = w
ramp.ServerDeltaW = w - serverIdleW
}
if !firstCalib.Completed {
ramp.Status = "FAILED"
@@ -3351,8 +3543,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW))
}
// Steps 2..N: each step fixes previously calibrated GPUs and searches only
// the new GPU's stable limit in the combined thermal environment.
// Steps 2..N: each step revalidates every already-active GPU under the new
// cumulative thermal environment and also calibrates the newly introduced
// GPU. Previously found limits are used only as seeds for the search.
for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ {
step := stepNum + 1
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
@@ -3360,17 +3553,46 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
_ = os.MkdirAll(stepDir, 0755)
// All previously calibrated GPUs are fixed at their stable limits.
fixedForStep := make(map[int]int, len(stableLimits))
for k, v := range stableLimits {
fixedForStep[k] = v
// Reuse the latest stable limits as starting points, but re-check every
// active GPU in this hotter configuration. For the newly introduced GPU,
// seed from its single-card calibration so we do not restart from the
// default TDP when a prior derated limit is already known.
seedForStep := make(map[int]int, len(subset))
for _, idx := range subset {
if lim, ok := stableLimits[idx]; ok && lim > 0 {
seedForStep[idx] = lim
continue
}
if base, ok := calibByIndex[idx]; ok {
lim := int(math.Round(base.AppliedPowerLimitW))
if lim > 0 {
seedForStep[idx] = lim
}
}
}
logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)",
step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep)))
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep)
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
ipmiStepDone := make(chan float64, 1)
go func() {
defer close(ipmiStepDone)
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
ipmiStepDone <- w
}
}()
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
ipmiStepCancel()
var stepIPMILoadedW float64
var stepIPMIOK bool
if w, ok := <-ipmiStepDone; ok {
stepIPMILoadedW = w
stepIPMIOK = true
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
}
// Accumulate restore actions; they all run in the outer defer.
allRestoreActions = append(allRestoreActions, stepRestore...)
@@ -3391,26 +3613,56 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
}
// Determine stable limit for the new GPU.
if c, ok := stepCalib[newGPUIdx]; ok && c.Completed {
stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW))
ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
ramp.Derated = c.Derated
for _, idx := range subset {
c, ok := stepCalib[idx]
if !ok || !c.Completed {
fallback := 0
if lim, ok := stableLimits[idx]; ok && lim > 0 {
fallback = lim
} else if fb, ok := calibByIndex[idx]; ok {
fallback = int(math.Round(fb.AppliedPowerLimitW))
}
if fallback > 0 {
stableLimits[idx] = fallback
}
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes,
fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
result.OverallStatus = "PARTIAL"
continue
}
prevLimit, hadPrev := stableLimits[idx]
newLimit := int(math.Round(c.AppliedPowerLimitW))
stableLimits[idx] = newLimit
if idx == newGPUIdx {
ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
ramp.Derated = c.Derated
}
if c.Derated {
ramp.Status = "PARTIAL"
if result.OverallStatus == "OK" {
result.OverallStatus = "PARTIAL"
}
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
}
} else {
// Calibration failed — fall back to single-card limit.
fb := calibByIndex[newGPUIdx]
stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW))
ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW))
result.OverallStatus = "PARTIAL"
if hadPrev && newLimit < prevLimit {
ramp.Notes = append(ramp.Notes,
fmt.Sprintf("GPU %d was re-derated from %d W to %d W under combined thermal load.", idx, prevLimit, newLimit))
}
}
if c, ok := stepCalib[newGPUIdx]; ok && c.Completed && c.Derated {
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
}
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
ramp.ServerLoadedW = stepIPMILoadedW
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
// The last step has all GPUs loaded — use it as the top-level loaded_w.
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = stepIPMILoadedW
serverLoadedOK = true
}
}
result.RampSteps = append(result.RampSteps, ramp)
@@ -3421,6 +3673,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
if lim, ok := stableLimits[result.GPUs[i].Index]; ok {
result.GPUs[i].StablePowerLimitW = float64(lim)
}
if result.GPUs[i].StablePowerLimitW > 0 && result.GPUs[i].AppliedPowerLimitW > 0 &&
result.GPUs[i].StablePowerLimitW < result.GPUs[i].AppliedPowerLimitW {
result.GPUs[i].Derated = true
result.Findings = append(result.Findings, fmt.Sprintf(
"GPU %d required additional derating from %.0f W (single-card) to %.0f W under full-system thermal load.",
result.GPUs[i].Index, result.GPUs[i].AppliedPowerLimitW, result.GPUs[i].StablePowerLimitW,
))
}
}
// PlatformMaxTDPW = sum of all stable limits — the actual sustained power
@@ -3428,6 +3688,15 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
for _, lim := range stableLimits {
result.PlatformMaxTDPW += float64(lim)
}
// Characterize server power from IPMI idle/loaded samples.
// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
// ReportingRatio = IPMI_delta / GPU_reported_sum:
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
_ = serverIdleOK // used implicitly via characterizeServerPower
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
// Write top-level gpu-metrics.csv/.html aggregating all phases.
writeBenchmarkMetricsFiles(runDir, allPowerRows)
resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil {
return "", fmt.Errorf("marshal power result: %w", err)

View File

@@ -261,14 +261,18 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("\n")
// Steady-state telemetry
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
b.WriteString("\n")
if benchmarkTelemetryAvailable(gpu.Steady) {
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
b.WriteString("\n")
} else {
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
}
// Per-precision stability phases.
if len(gpu.PrecisionSteady) > 0 {

View File

@@ -49,8 +49,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
benchmarkPrecisionPhases,
func(label string) string { return label },
)
if len(labels) != 7 || len(phases) != 7 {
t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
if len(labels) != 5 || len(phases) != 5 {
t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
}
if basePhaseSec != 60 {
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
@@ -61,7 +61,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
}
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
@@ -80,7 +80,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
if mixedPhaseSec != 3600 {
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
@@ -99,7 +99,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
if mixedPhaseSec != 14400 {
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
@@ -133,10 +133,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
t.Parallel()
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
t.Fatalf("supported=%v", got)
}
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
t.Fatalf("supported=%v", got)
}
}
@@ -314,6 +314,30 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
}
}
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
t.Parallel()
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
},
PrecisionResults: []BenchmarkPrecisionResult{
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
},
})
if score.SyntheticScore != 100 {
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
}
if score.MixedScore != 50 {
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
}
}
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
t.Parallel()

View File

@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
Available bool `json:"available"`
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
Notes []string `json:"notes,omitempty"`
@@ -55,32 +56,32 @@ type NvidiaBenchmarkOptions struct {
}
type NvidiaBenchmarkResult struct {
BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"`
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
RampStep int `json:"ramp_step,omitempty"`
RampTotal int `json:"ramp_total,omitempty"`
RampRunID string `json:"ramp_run_id,omitempty"`
ScalabilityScore float64 `json:"scalability_score,omitempty"`
BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"`
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
RampStep int `json:"ramp_step,omitempty"`
RampTotal int `json:"ramp_total,omitempty"`
RampRunID string `json:"ramp_run_id,omitempty"`
ScalabilityScore float64 `json:"scalability_score,omitempty"`
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
// 100% = each added GPU contributes exactly its single-card throughput.
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
OverallStatus string `json:"overall_status"`
SelectedGPUIndices []int `json:"selected_gpu_indices"`
Findings []string `json:"findings,omitempty"`
Warnings []string `json:"warnings,omitempty"`
Normalization BenchmarkNormalization `json:"normalization"`
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
OverallStatus string `json:"overall_status"`
SelectedGPUIndices []int `json:"selected_gpu_indices"`
Findings []string `json:"findings,omitempty"`
Warnings []string `json:"warnings,omitempty"`
Normalization BenchmarkNormalization `json:"normalization"`
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
}
type BenchmarkNormalization struct {
@@ -223,8 +224,8 @@ type BenchmarkScorecard struct {
// Throttle breakdown — percentage of steady-state time in each throttle type.
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
// Temperature headroom: distance to the 100°C destruction threshold.
@@ -300,18 +301,22 @@ type NvidiaPowerBenchResult struct {
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
// cumulative thermal ramp. Represents the actual sustained power budget of
// this server under full GPU load. Use for rack power planning.
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
// ServerPower captures IPMI server power delta (idle→loaded) measured in
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
// actual wall-power draw as seen by the server's power supply.
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
}
type NvidiaPowerBenchGPU struct {
Index int `json:"index"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
Index int `json:"index"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// AppliedPowerLimitW is the stable limit found during single-card calibration.
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
// StablePowerLimitW is the final fixed limit for this GPU after the
// cumulative thermal ramp. This is the limit at which the GPU operated
// stably with all other GPUs running simultaneously at their own limits.
@@ -326,13 +331,20 @@ type NvidiaPowerBenchGPU struct {
Notes []string `json:"notes,omitempty"`
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
CoolingWarning string `json:"cooling_warning,omitempty"`
// ServerLoadedW is the IPMI server power reading captured during this
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW idle.
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
// Telemetry holds the aggregated stats from the final converged calibration
// attempt for this GPU (temperature, power, fan, clock percentiles).
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
}
type NvidiaPowerBenchStep struct {
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
// NewGPUIndex is the GPU whose stable limit was searched in this step.
NewGPUIndex int `json:"new_gpu_index"`
NewGPUIndex int `json:"new_gpu_index"`
// NewGPUStableLimitW is the stable power limit found for the new GPU.
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
@@ -340,20 +352,24 @@ type NvidiaPowerBenchStep struct {
Derated bool `json:"derated,omitempty"`
Status string `json:"status"`
Notes []string `json:"notes,omitempty"`
// ServerLoadedW is the IPMI server power reading captured during this
// ramp step's calibration run. ServerDeltaW = ServerLoadedW idle.
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
}
// NvidiaPerformanceRampStep holds per-step performance data for the
// scalability ramp-up phase of the performance benchmark.
type NvidiaPerformanceRampStep struct {
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
// TOPS from dedicated single-precision phases) across all GPUs in this step.
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
ScalabilityPct float64 `json:"scalability_pct"`
Status string `json:"status"`
Notes []string `json:"notes,omitempty"`
ScalabilityPct float64 `json:"scalability_pct"`
Status string `json:"status"`
Notes []string `json:"notes,omitempty"`
}

View File

@@ -27,6 +27,7 @@ type GPUMetricRow struct {
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
}
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
for _, r := range rows {
dutyAvail := 0
if r.FanDutyCycleAvailable {
dutyAvail = 1
}
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
dutyEstimated := 0
if r.FanDutyCycleEstimated {
dutyEstimated = 1
}
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
}
return os.WriteFile(path, b.Bytes(), 0644)
}

View File

@@ -140,26 +140,56 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
}
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
if err != nil || len(squashfsFiles) == 0 {
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
}
free := freeMemBytes()
var needed int64
for _, sf := range squashfsFiles {
fi, err2 := os.Stat(sf)
if err2 != nil {
return fmt.Errorf("stat %s: %v", sf, err2)
}
needed += fi.Size()
}
const headroom = 256 * 1024 * 1024
if free > 0 && needed+headroom > free {
return fmt.Errorf("insufficient RAM: need %s, available %s",
humanBytes(needed+headroom), humanBytes(free))
}
sourceAvailable := err == nil && len(squashfsFiles) > 0
dstDir := installToRAMDir
// If the source medium is unavailable, check whether a previous run already
// produced a complete copy in RAM. If so, skip the copy phase and proceed
// directly to the loop-rebind / bind-mount steps.
if !sourceAvailable {
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
if len(copiedFiles) > 0 {
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
// Proceed to rebind with the already-copied files.
for _, dst := range copiedFiles {
base := filepath.Base(dst)
// Re-associate the loop device that was originally backed by the
// source file (now gone); find it by the old source path pattern.
srcGuess := "/run/live/medium/live/" + base
loopDev, lerr := findLoopForFile(srcGuess)
if lerr != nil {
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
continue
}
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
} else {
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
}
}
goto bindMedium
}
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
}
{
free := freeMemBytes()
var needed int64
for _, sf := range squashfsFiles {
fi, err2 := os.Stat(sf)
if err2 != nil {
return fmt.Errorf("stat %s: %v", sf, err2)
}
needed += fi.Size()
}
const headroom = 256 * 1024 * 1024
if free > 0 && needed+headroom > free {
return fmt.Errorf("insufficient RAM: need %s, available %s",
humanBytes(needed+headroom), humanBytes(free))
}
}
if state.CopyPresent {
log("Removing stale partial RAM copy before retry...")
}
@@ -199,6 +229,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
}
}
bindMedium:
log("Copying remaining medium files...")
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
log(fmt.Sprintf("Warning: partial copy: %v", err))

View File

@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
return string(raw), err
}
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
// detect GPU count
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
gpuCount := len(selected)
if gpuCount < 1 {
gpuCount = 1
}
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20",
}},
}, env: nvidiaVisibleDevicesEnv(selected)},
), logFunc)
}

View File

@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
UpdatedAt time.Time
}
// fanObservationState is the persisted record of the highest RPM ever
// observed per fan sensor, keyed by sensor name. Serialized as JSON to
// fanObservationStatePath so observed maxima survive process restarts.
type fanObservationState struct {
	MaxRPM map[string]float64 `json:"max_rpm"`
}

// fanPeakCandidate tracks a provisional new RPM peak for one fan: when the
// above-max reading was first seen and the highest RPM seen since. The
// candidate is only promoted to a new observed max after it has held for
// fanObservationMinPeakHold, filtering out one-sample spikes.
type fanPeakCandidate struct {
	FirstSeen time.Time
	RPM       float64
}
var (
systemPowerCacheMu sync.Mutex
systemPowerCache cachedPowerReading
fanObservationMu sync.Mutex
fanObservation fanObservationState
fanObservationInit bool
fanPeakCandidates = make(map[string]fanPeakCandidate)
)
const systemPowerHoldTTL = 15 * time.Second
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
const fanObservationMinPeakHold = time.Second
// normalizeObservedFanMaxRPM rounds a positive observed fan peak up to the
// next multiple of 1000 RPM; non-positive readings normalize to 0.
func normalizeObservedFanMaxRPM(rpm float64) float64 {
	if rpm > 0 {
		return math.Ceil(rpm/1000.0) * 1000.0
	}
	return 0
}
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
if err == nil {
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil
}
}
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
if len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil
}
if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
return nil, sensorsErr
}
// loadFanObservationLocked lazily loads the persisted per-fan max-RPM map
// from fanObservationStatePath into the in-memory fanObservation state.
// Runs at most once per process (guarded by fanObservationInit); a missing
// file, read error, or invalid JSON leaves the map empty. Caller must hold
// fanObservationMu.
func loadFanObservationLocked() {
	if fanObservationInit {
		return
	}
	fanObservationInit = true
	fanObservation.MaxRPM = make(map[string]float64)
	raw, err := os.ReadFile(fanObservationStatePath)
	if err != nil || len(raw) == 0 {
		return
	}
	var persisted fanObservationState
	if json.Unmarshal(raw, &persisted) != nil {
		return
	}
	// Keep only well-formed entries: non-empty trimmed names with positive RPM.
	for name, rpm := range persisted.MaxRPM {
		name = strings.TrimSpace(name)
		if name == "" || rpm <= 0 {
			continue
		}
		fanObservation.MaxRPM[name] = rpm
	}
}
// saveFanObservationLocked persists the in-memory per-fan max-RPM map to
// fanObservationStatePath as indented JSON. No-ops when there is nothing to
// save; directory creation, marshal, and write errors are deliberately
// ignored (persistence is best-effort). Caller must hold fanObservationMu.
func saveFanObservationLocked() {
	if len(fanObservation.MaxRPM) == 0 {
		return
	}
	dir := filepath.Dir(fanObservationStatePath)
	if dir == "" || dir == "." {
		// Fall back to the default state directory if the path has no parent.
		dir = "/var/log/bee-sat"
	}
	if err := os.MkdirAll(dir, 0755); err != nil {
		return
	}
	raw, err := json.MarshalIndent(fanObservation, "", "  ")
	if err != nil {
		return
	}
	_ = os.WriteFile(fanObservationStatePath, raw, 0644)
}
// updateFanObservation folds a batch of fan readings into the observed
// per-fan maximum RPM state. A reading above the current max does not become
// the new max immediately: it is staged as a fanPeakCandidate and only
// accepted once above-max readings have persisted for at least
// fanObservationMinPeakHold (filtering one-sample spikes). Accepted peaks are
// rounded up via normalizeObservedFanMaxRPM and persisted to disk.
func updateFanObservation(fans []FanReading, now time.Time) {
	if len(fans) == 0 {
		return
	}
	fanObservationMu.Lock()
	defer fanObservationMu.Unlock()
	loadFanObservationLocked()
	changed := false
	for _, fan := range fans {
		name := strings.TrimSpace(fan.Name)
		if name == "" || fan.RPM <= 0 {
			continue
		}
		currentMax := fanObservation.MaxRPM[name]
		if fan.RPM <= currentMax {
			// Reading is back within the known range — abandon any pending peak.
			delete(fanPeakCandidates, name)
			continue
		}
		if cand, ok := fanPeakCandidates[name]; ok {
			if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
				// Candidate held long enough — accept the highest RPM seen.
				newMax := math.Max(cand.RPM, fan.RPM)
				if newMax > currentMax {
					fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
					changed = true
				}
				delete(fanPeakCandidates, name)
				continue
			}
			// Still inside the hold window — track the highest candidate RPM,
			// keeping the original FirstSeen timestamp.
			if fan.RPM > cand.RPM {
				fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
			}
			continue
		}
		// First above-max reading for this fan — start the hold window.
		fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
	}
	if changed {
		saveFanObservationLocked()
	}
}
// estimateFanDutyCyclePctFromObservation estimates the average fan duty cycle
// (percent) by dividing each fan's current RPM by its historically observed
// maximum RPM from fanObservation. Fans with no recorded max (or empty name /
// non-positive RPM) are skipped; per-fan percentages are clamped to [0, 100].
// Returns false when no fan yields a usable sample.
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
	if len(fans) == 0 {
		return 0, false
	}
	fanObservationMu.Lock()
	defer fanObservationMu.Unlock()
	loadFanObservationLocked()
	var samples []float64
	for _, fan := range fans {
		name := strings.TrimSpace(fan.Name)
		if name == "" || fan.RPM <= 0 {
			continue
		}
		maxRPM := fanObservation.MaxRPM[name]
		if maxRPM <= 0 {
			continue
		}
		pct := fan.RPM / maxRPM * 100.0
		if pct > 100 {
			pct = 100
		}
		if pct < 0 {
			pct = 0
		}
		samples = append(samples, pct)
	}
	if len(samples) == 0 {
		return 0, false
	}
	// Average across all fans that produced a sample.
	return benchmarkMean(samples), true
}
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
// Handles two formats:
//
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
// Returns the average duty cycle across all exposed PWM controls.
func sampleFanDutyCyclePct() (float64, bool) {
func sampleFanDutyCyclePct() (float64, bool, bool) {
out, err := exec.Command("sensors", "-j").Output()
if err != nil || len(out) == 0 {
return 0, false
fans, fanErr := sampleFanSpeeds()
if fanErr != nil {
return 0, false, false
}
return sampleFanDutyCyclePctFromFans(fans)
}
return parseFanDutyCyclePctSensorsJSON(out)
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
return pct, ok, false
}
// sampleFanDutyCyclePctFromFans derives a duty-cycle estimate from raw fan
// readings. Returns (pct, available, estimated); when available, estimated is
// true because the value comes from the observed-max heuristic rather than a
// direct PWM sensor reading.
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
	if len(fans) > 0 {
		if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
			return pct, true, true
		}
	}
	return 0, false, false
}
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {

View File

@@ -1,6 +1,7 @@
package platform
import (
"path/filepath"
"testing"
"time"
)
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
}
}
// TestEstimateFanDutyCyclePctFromObservation exercises the peak-hold logic:
// a single spike must not establish an observed max, a peak held past the
// hold window must, and the persisted max must be reloaded from disk after
// an in-memory state reset.
//
// NOTE: this test swaps package-level state (fanObservationStatePath,
// fanObservation, fanObservationInit, fanPeakCandidates), so it must NOT run
// concurrently with other tests that touch the fan-observation globals; the
// former t.Parallel() call was removed to avoid that data race.
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
	oldPath := fanObservationStatePath
	oldState := fanObservation
	oldInit := fanObservationInit
	oldCandidates := fanPeakCandidates
	fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
	fanObservation = fanObservationState{}
	fanObservationInit = false
	fanPeakCandidates = make(map[string]fanPeakCandidate)
	t.Cleanup(func() {
		fanObservationStatePath = oldPath
		fanObservation = oldState
		fanObservationInit = oldInit
		fanPeakCandidates = oldCandidates
	})
	start := time.Unix(100, 0)
	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
	if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
		t.Fatalf("single-sample spike should not establish observed max")
	}
	// Second sample inside the hold window raises the candidate; the third,
	// past the window, accepts it: max = ceil(5200/1000)*1000 = 6000.
	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
	got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
	if !ok {
		t.Fatalf("expected estimated duty cycle from persisted observed max")
	}
	// 2600 / 6000 ≈ 43.3 %.
	if got < 43 || got > 44 {
		t.Fatalf("got=%v want ~43.3", got)
	}
	// Wipe the in-memory state; the estimate must come from the JSON file
	// written by updateFanObservation.
	fanObservation = fanObservationState{}
	fanObservationInit = false
	fanPeakCandidates = make(map[string]fanPeakCandidate)
	got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
	if !ok {
		t.Fatalf("expected persisted observed max to be reloaded from disk")
	}
	if got < 43 || got > 44 {
		t.Fatalf("reloaded got=%v want ~43.3", got)
	}
}
func TestParseDCMIPowerReading(t *testing.T) {
raw := `
Instantaneous power reading: 512 Watts

View File

@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
}
}
// TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth checks that the
// nvbandwidth diag command line carries no duration flag and preserves the
// caller's GPU index order.
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
	got := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
	want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
	if len(got) != len(want) {
		t.Fatalf("cmd len=%d want %d (%v)", len(got), len(want), got)
	}
	for i, w := range want {
		if got[i] != w {
			t.Fatalf("cmd[%d]=%q want %q", i, got[i], w)
		}
	}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 2 {

View File

@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
)) +
`</div>` +
`<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
)) +
`</div>` +
`</div>
@@ -1527,8 +1527,6 @@ function satModeChanged() {
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
].forEach(function(item) {
const card = document.getElementById(item.card);
if (card) {
@@ -1776,7 +1774,7 @@ function runAllSAT() {
const cycles = 1;
const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...';
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const activeTargets = baseTargets.filter(target => {
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -2016,9 +2014,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
// ── Benchmark ─────────────────────────────────────────────────────────────────
type benchmarkHistoryRun struct {
generatedAt time.Time
displayTime string
gpuScores map[int]float64 // GPU index → composite score
generatedAt time.Time
displayTime string
gpuScores map[int]float64 // GPU index → composite score
gpuStatuses map[int]string // GPU index → status ("OK", "WARNING", "FAILED", …)
overallStatus string
}
func renderBenchmark(opts HandlerOptions) string {
@@ -2082,7 +2082,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div>
</div>
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2326,7 +2326,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
}
b.WriteString(`<div style="overflow-x:auto">`)
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
for i := 0; i <= maxGPUIndex; i++ {
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
}
@@ -2335,13 +2335,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
b.WriteString(`<tr>`)
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
overallColor := "var(--ok)"
overallLabel := run.overallStatus
if overallLabel == "" {
overallLabel = "OK"
}
if overallLabel == "FAILED" {
overallColor = "var(--crit-fg,#9f3a38)"
} else if overallLabel != "OK" {
overallColor = "var(--warn)"
}
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
for idx := 0; idx <= maxGPUIndex; idx++ {
score, ok := run.gpuScores[idx]
if !ok {
b.WriteString(`<td style="color:var(--muted)">-</td>`)
continue
}
b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
gpuStatus := run.gpuStatuses[idx]
scoreColor := ""
switch gpuStatus {
case "FAILED":
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
case "WARNING", "PARTIAL":
scoreColor = ` style="color:var(--warn);font-weight:600"`
case "", "OK":
// no override
default:
scoreColor = ` style="color:var(--warn);font-weight:600"`
}
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
}
b.WriteString(`</tr>`)
}
@@ -2375,12 +2398,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
continue
}
run := benchmarkHistoryRun{
generatedAt: result.GeneratedAt,
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
gpuScores: make(map[int]float64),
generatedAt: result.GeneratedAt,
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
gpuScores: make(map[int]float64),
gpuStatuses: make(map[int]string),
overallStatus: result.OverallStatus,
}
for _, gpu := range result.GPUs {
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
run.gpuStatuses[gpu.Index] = gpu.Status
if gpu.Index > maxGPUIndex {
maxGPUIndex = gpu.Index
}
@@ -2449,31 +2475,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
if len(latest.GPUs) > 0 {
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
b.WriteString(`</tr></thead><tbody>`)
for _, gpu := range latest.GPUs {
derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
// finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
// falling back to single-card applied limit if the ramp hasn't run.
finalLimitW := gpu.StablePowerLimitW
if finalLimitW <= 0 {
finalLimitW = gpu.AppliedPowerLimitW
}
// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
derated := gpu.Derated ||
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
rowStyle := ""
achievedStyle := ""
finalStyle := ""
if derated {
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
achievedStyle = ` style="color:#e6a000;font-weight:600"`
finalStyle = ` style="color:#e6a000;font-weight:600"`
}
statusLabel := gpu.Status
if statusLabel == "" {
statusLabel = "OK"
}
statusColor := "var(--ok)"
if statusLabel != "OK" {
if statusLabel == "FAILED" {
statusColor = "var(--crit-fg,#9f3a38)"
} else if statusLabel != "OK" {
statusColor = "var(--warn)"
}
nominalStr := "-"
if gpu.DefaultPowerLimitW > 0 {
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
}
achievedStr := "-"
singleStr := "-"
if gpu.AppliedPowerLimitW > 0 {
achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
}
multiStr := "-"
if gpu.StablePowerLimitW > 0 {
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
}
p95Str := "-"
if gpu.MaxObservedPowerW > 0 {
@@ -2483,7 +2523,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
b.WriteString(`<td>` + nominalStr + `</td>`)
b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
b.WriteString(`<td>` + singleStr + `</td>`)
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
b.WriteString(`<td>` + p95Str + `</td>`)
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
b.WriteString(`</tr>`)
@@ -2517,7 +2558,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
func renderBurn() string {
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px">

View File

@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
}
}
// TestValidatePageRendersNvidiaFabricCardsInValidateMode checks that /validate
// renders the NCCL and NVBandwidth cards with copy advertising them as
// runnable in Validate mode (not stress-only).
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/validate", nil)
	NewHandler(HandlerOptions{}).ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d", rec.Code)
	}
	body := rec.Body.String()
	needles := []string{
		`NVIDIA Interconnect (NCCL)`,
		`Runs in Validate and Stress.`,
		`NVIDIA Bandwidth (NVBandwidth)`,
		`Intended to stay short enough for Validate.`,
	}
	for _, needle := range needles {
		if !strings.Contains(body, needle) {
			t.Fatalf("validate page missing %q: %s", needle, body)
		}
	}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()

View File

@@ -613,8 +613,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
}
a := q.opts.App
recovered := len(j.lines) > 0
j.append(fmt.Sprintf("Starting %s...", t.Name))
if len(j.lines) > 0 {
if recovered {
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
}
@@ -736,15 +737,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-stress":
if a == nil {
err = fmt.Errorf("app not configured")

View File

@@ -713,6 +713,19 @@ static const struct profile_desc k_profiles[] = {
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
/* profile_allowed_for_run decides whether a benchmark profile should run for
 * the given compute capability (cc) and optional precision filter.
 * With a filter set, only the profile whose block_label matches runs.
 * Without a filter, every eligible profile runs EXCEPT fp64 and fp4: both
 * paths are unstable on the current benchmark fleet and can abort the whole
 * mixed pass after earlier phases already collected useful telemetry. */
static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
    if (!desc->enabled || cc < desc->min_cc) {
        return 0;
    }
    if (precision_filter != NULL) {
        return strcmp(desc->block_label, precision_filter) == 0;
    }
    if (strcmp(desc->block_label, "fp64") == 0) {
        return 0;
    }
    return strcmp(desc->block_label, "fp4") != 0;
}
static int load_cublaslt(struct cublaslt_api *api) {
memset(api, 0, sizeof(*api));
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -1222,8 +1235,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
/* Count profiles matching the filter (for deciding what to run). */
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
(precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
planned++;
}
}
@@ -1240,7 +1252,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
* profiles matching precision_filter. */
int planned_total = 0;
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
planned_total++;
}
}
@@ -1310,10 +1322,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
desc->min_cc);
continue;
}
if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
if (!profile_allowed_for_run(desc, cc, precision_filter)) {
append_detail(report->details,
sizeof(report->details),
"%s=SKIPPED precision_filter\n",
"%s=SKIPPED benchmark_disabled\n",
desc->name);
continue;
}

View File

@@ -16,6 +16,11 @@ menuentry "EASY-BEE" {
}
submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — load to RAM (toram)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE — GSP=off" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@

View File

@@ -63,8 +63,10 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
chmod +x /usr/local/bin/bee 2>/dev/null || true
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
if [ "$GPU_VENDOR" = "nvidia" ]; then
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true

View File

@@ -1,117 +0,0 @@
#!/bin/sh
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
set -e
echo "=== generating bee wallpaper ==="
mkdir -p /usr/share/bee
python3 - <<'PYEOF'
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import os
W, H = 1920, 1080
ASCII_ART = [
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
]
SUBTITLE = " Hardware Audit LiveCD"
FG = (0xF6, 0xD0, 0x47)
FG_DIM = (0xD4, 0xA9, 0x1C)
SHADOW = (0x5E, 0x47, 0x05)
SUB = (0x96, 0x7A, 0x17)
BG = (0x05, 0x05, 0x05)
MONO_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
]
SUB_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
]
def load_font(candidates, size):
for path in candidates:
if os.path.exists(path):
return ImageFont.truetype(path, size)
return ImageFont.load_default()
def mono_metrics(font):
probe = Image.new('L', (W, H), 0)
draw = ImageDraw.Draw(probe)
char_w = int(round(draw.textlength("M", font=font)))
bb = draw.textbbox((0, 0), "Mg", font=font)
char_h = bb[3] - bb[1]
return char_w, char_h
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
width = max(len(line) for line in lines) * char_w
height = len(lines) * char_h + line_gap * (len(lines) - 1)
mask = Image.new('L', (width, height), 0)
draw = ImageDraw.Draw(mask)
for row, line in enumerate(lines):
y = row * (char_h + line_gap)
for col, ch in enumerate(line):
if ch == ' ':
continue
x = col * char_w
draw.text((x, y), ch, font=font, fill=255)
return mask
img = Image.new('RGB', (W, H), BG)
draw = ImageDraw.Draw(img)
# Soft amber glow under the logo without depending on font rendering.
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
glow_draw = ImageDraw.Draw(glow)
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
glow = glow.filter(ImageFilter.GaussianBlur(60))
img = Image.alpha_composite(img.convert('RGBA'), glow)
TARGET_LOGO_W = 400
max_chars = max(len(line) for line in ASCII_ART)
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
_probe_cw, _ = mono_metrics(_probe_font)
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
char_w, char_h = mono_metrics(font_logo)
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
logo_w, logo_h = logo_mask.size
logo_x = (W - logo_w) // 2
logo_y = 380
sh_off = max(1, font_size_logo // 6)
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
img.paste(FG, (logo_x, logo_y), logo_mask)
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
sub_y = logo_y + logo_h + 48
draw = ImageDraw.Draw(img)
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
img = img.convert('RGB')
img.save('/usr/share/bee/wallpaper.png', optimize=True)
print('wallpaper written: /usr/share/bee/wallpaper.png')
PYEOF
echo "=== wallpaper done ==="

View File

@@ -0,0 +1,46 @@
#!/bin/sh
# 9011-toram-rsync.hook.chroot
#
# Adds rsync to the initramfs so that live-boot's toram code takes the
# rsync --progress path instead of the silent "cp -a" fallback.
#
# live-boot's 9990-toram-todisk.sh already contains:
#     if [ -x /bin/rsync ]; then
#         rsync -a --progress ... 1>/dev/console
#     else
#         cp -a ...   # no output
#     fi
#
# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
# which copies the binary + all shared-library dependencies into the initrd.
set -e

HOOK_DIR="/etc/initramfs-tools/hooks"
HOOK="${HOOK_DIR}/bee-rsync"

mkdir -p "${HOOK_DIR}"
cat > "${HOOK}" << 'EOF'
#!/bin/sh
# initramfs hook: include rsync for live-boot toram progress output
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac
. /usr/share/initramfs-tools/hook-functions
if [ -x /usr/bin/rsync ]; then
    copy_exec /usr/bin/rsync /bin
fi
EOF
chmod +x "${HOOK}"
echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"

# Rebuild initramfs so the hook takes effect in the ISO's initrd.img.
# Pick the newest installed kernel version; fail with a clear message if no
# kernel is present instead of letting "update-initramfs -k ''" die obscurely.
KVER=$(ls -1 /lib/modules 2>/dev/null | sort -V | tail -n 1)
if [ -z "${KVER}" ]; then
    echo "9011-toram-rsync: ERROR: no kernels found under /lib/modules" >&2
    exit 1
fi
echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
update-initramfs -u -k "${KVER}"
echo "9011-toram-rsync: done"

View File

@@ -3,6 +3,7 @@ dmidecode
smartmontools
nvme-cli
pciutils
rsync
ipmitool
util-linux
e2fsprogs

View File

@@ -65,6 +65,9 @@ done
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
if [ ! -f "$SQUASHFS" ]; then
echo "ERROR: squashfs not found at $SQUASHFS" >&2
echo " The live medium may have been disconnected." >&2
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
echo " Then re-run bee-install." >&2
exit 1
fi
@@ -162,10 +165,59 @@ log " Mounted."
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
log " Source: $SQUASHFS"
log " Target: $MOUNT_ROOT"
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
grep -E '^\[|^inod|^created|^extract' | \
while read -r line; do log " $line"; done || true
log " Unpack complete."
# unsquashfs does not support resume, so retry the entire unpack step if the
# source medium disappears mid-copy (e.g. CD physically disconnected).
UNPACK_ATTEMPTS=0
UNPACK_MAX=5
while true; do
    UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
    if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
        die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
    fi
    [ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."

    # Re-check squashfs is reachable before each attempt
    if [ ! -f "$SQUASHFS" ]; then
        log " SOURCE LOST: $SQUASHFS not found."
        log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
        log " then press Enter here to retry."
        read -r _
        continue
    fi

    # wipe partial unpack so unsquashfs starts clean
    if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
        log " Cleaning partial unpack from $MOUNT_ROOT ..."
        # keep the mount point itself but remove its contents
        find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
    fi

    # NOTE: the pipeline's exit status reflects the grep/while stages, not
    # unsquashfs itself, so it is not a reliable success signal (a previous
    # revision captured it into a variable that was never read). Success is
    # judged below instead: the medium must still be present and the unpack
    # must have produced a plausible root filesystem.
    unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
        grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
        while IFS= read -r line; do log " $line"; done || true

    # Check squashfs is still reachable (gone = disc pulled during copy)
    if [ ! -f "$SQUASHFS" ]; then
        log " WARNING: source medium lost during unpack — will retry after remount."
        log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
        read -r _
        continue
    fi

    # Verify the unpack produced a usable root (presence of /etc is a basic check)
    if [ -d "${MOUNT_ROOT}/etc" ]; then
        log " Unpack complete."
        break
    else
        log " WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
        if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
            log " Retrying in 5 s ..."
            sleep 5
        fi
    fi
done
# ------------------------------------------------------------------
log "--- Step 6/7: Configuring installed system ---"

View File

@@ -0,0 +1,100 @@
#!/bin/bash
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
#
# Intended for use after reconnecting the ISO source disc (USB/CD) when the
# live medium was lost and /run/live/medium/live/filesystem.squashfs is gone.
#
# Usage: bee-remount-medium [--wait]
#   --wait  keep retrying every 5 seconds until the medium is found (useful
#           while physically reconnecting the device)
set -euo pipefail

MEDIUM_DIR="/run/live/medium"
SQUASHFS_REL="live/filesystem.squashfs"
WAIT_MODE=0

# Argument scan: only --wait/-w and --help/-h are recognized; anything else
# is silently ignored, matching the original behavior.
while [ "$#" -gt 0 ]; do
    case "$1" in
        --wait|-w)
            WAIT_MODE=1
            ;;
        --help|-h)
            echo "Usage: bee-remount-medium [--wait]"
            echo " Finds and remounts the live ISO medium to $MEDIUM_DIR"
            echo " --wait retry every 5 s until a medium with squashfs is found"
            exit 0 ;;
    esac
    shift
done

# log prints a timestamped status line; die logs an error and aborts.
log() { echo "[$(date +%H:%M:%S)] $*"; }
die() { log "ERROR: $*" >&2; exit 1; }
# Return all candidate block devices (optical + removable USB mass storage),
# one path per line on stdout.
# NOTE(review): despite the inline comment below, nothing here actually skips
# the device the live system booted from — the only filter is the sysfs
# "removable" flag. Harmless in practice (probes mount read-only), but worth
# confirming if the boot medium is itself removable.
find_candidates() {
    # CD/DVD drives
    for dev in /dev/sr* /dev/scd*; do
        [ -b "$dev" ] && echo "$dev"
    done
    # USB/removable disks and partitions
    for dev in /dev/sd* /dev/vd*; do
        [ -b "$dev" ] || continue
        # Only removable disks or their partitions qualify; the partition
        # suffix is stripped to reach /sys/block/<disk>/removable.
        local removable
        local base
        base=$(basename "$dev")
        removable=$(cat "/sys/block/${base%%[0-9]*}/removable" 2>/dev/null || echo 0)
        [ "$removable" = "1" ] && echo "$dev"
    done
}
# Try to mount $1 to $MEDIUM_DIR and check for squashfs.
# Returns 0 only when the device carries $SQUASHFS_REL and was successfully
# mounted read-only onto $MEDIUM_DIR; returns 1 otherwise.
try_mount() {
    local dev="$1"
    local tmpdir
    # Probe on a throwaway mount point first so $MEDIUM_DIR is only disturbed
    # once this device is known to carry the live squashfs.
    tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
    if mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
        if [ -f "${tmpdir}/${SQUASHFS_REL}" ]; then
            # Unmount probe mount and mount properly onto live path
            umount "$tmpdir" 2>/dev/null || true
            rmdir "$tmpdir" 2>/dev/null || true
            # Unmount whatever is currently on MEDIUM_DIR (may be empty/stale)
            umount "$MEDIUM_DIR" 2>/dev/null || true
            mkdir -p "$MEDIUM_DIR"
            if mount -o ro "$dev" "$MEDIUM_DIR"; then
                log "Mounted $dev on $MEDIUM_DIR"
                return 0
            else
                log "Mount of $dev on $MEDIUM_DIR failed"
                return 1
            fi
        fi
        # Wrong medium: release the probe mount before moving on.
        umount "$tmpdir" 2>/dev/null || true
    fi
    rmdir "$tmpdir" 2>/dev/null || true
    return 1
}
# attempt performs one scan over all candidate devices and returns 0 as soon
# as one of them mounts with the live squashfs present, 1 if none qualified.
attempt() {
    log "Scanning for ISO medium..."
    local dev sq
    while IFS= read -r dev; do
        [ -n "$dev" ] || continue
        log " Trying $dev ..."
        if try_mount "$dev"; then
            sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
            log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
            return 0
        fi
    done <<< "$(find_candidates)"
    return 1
}
# Entry point: one scan by default; with --wait, poll every 5 s until found.
if [ "$WAIT_MODE" != "1" ]; then
    attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
else
    log "Waiting for live medium (press Ctrl+C to abort)..."
    until attempt; do
        log " Not found — retrying in 5 s (reconnect the disc now)"
        sleep 5
    done
    exit 0
fi