Compare commits

...

4 Commits
v8.18 ... v8.22

Author SHA1 Message Date
Mikhail Chusavitin
dca4afb8d0 Seed power ramp with single-card TDP limits 2026-04-16 11:43:01 +03:00
Mikhail Chusavitin
b4280941f5 Move NCCL and NVBandwidth into validate mode 2026-04-16 11:02:30 +03:00
Mikhail Chusavitin
f74976ec4c Use static overlay wallpaper in ISO build 2026-04-16 10:54:03 +03:00
Mikhail Chusavitin
18e24a9aa5 Estimate fan duty from observed RPM maxima 2026-04-16 10:10:18 +03:00
13 changed files with 379 additions and 199 deletions

View File

@@ -146,7 +146,7 @@ type satRunner interface {
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
} }
type runtimeChecker interface { type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc) return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
} }
// RunNCCLTests delegates NCCL tests to the SAT runner, defaulting the base
// directory when the caller passes an empty or all-whitespace path.
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
	dir := baseDir
	if strings.TrimSpace(dir) == "" {
		dir = DefaultSATBaseDir
	}
	return a.sat.RunNCCLTests(ctx, dir, gpuIndices, logFunc)
}
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) { func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil) path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
body := "Results: " + path body := "Results: " + path
if err != nil && err != context.Canceled { if err != nil && err != context.Canceled {
body += "\nERROR: " + err.Error() body += "\nERROR: " + err.Error()

View File

@@ -128,6 +128,7 @@ type fakeSAT struct {
runNvidiaPowerFn func(string, int, []int) (string, error) runNvidiaPowerFn func(string, int, []int) (string, error)
runNvidiaPulseFn func(string, int, []int) (string, error) runNvidiaPulseFn func(string, int, []int) (string, error)
runNvidiaBandwidthFn func(string, []int) (string, error) runNvidiaBandwidthFn func(string, []int) (string, error)
runNCCLFn func(string, []int) (string, error)
runNvidiaTargetedStressFn func(string, int, []int) (string, error) runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runMemoryFn func(string) (string, error) runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error) runStorageFn func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
return "", nil return "", nil
} }
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) { func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNCCLFn != nil {
return f.runNCCLFn(baseDir, gpuIndices)
}
return "", nil return "", nil
} }
// TestRunNCCLTestsPassesSelectedGPUs verifies that App.RunNCCLTests forwards
// both the base directory and the selected GPU indices to the SAT runner
// unchanged, and returns the runner's result path.
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
	t.Parallel()

	var (
		seenBaseDir string
		seenGPUs    []int
	)
	app := &App{
		sat: fakeSAT{
			runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
				seenBaseDir = baseDir
				// Copy the slice so later caller-side mutation cannot
				// affect what we assert on.
				seenGPUs = append([]int(nil), gpuIndices...)
				return "/tmp/nccl-tests.tar.gz", nil
			},
		},
	}

	path, err := app.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
	if err != nil {
		t.Fatalf("RunNCCLTests error: %v", err)
	}
	if want := "/tmp/nccl-tests.tar.gz"; path != want {
		t.Fatalf("path=%q want %q", path, want)
	}
	if seenBaseDir != "/tmp/sat" {
		t.Fatalf("baseDir=%q want %q", seenBaseDir, "/tmp/sat")
	}
	if len(seenGPUs) != 2 || seenGPUs[0] != 3 || seenGPUs[1] != 1 {
		t.Fatalf("gpuIndices=%v want [3 1]", seenGPUs)
	}
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) { func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -1122,6 +1122,7 @@ type benchmarkCoolingSample struct {
AvgFanRPM float64 AvgFanRPM float64
AvgFanDutyCyclePct float64 AvgFanDutyCyclePct float64
FanDutyCycleAvailable bool FanDutyCycleAvailable bool
FanDutyCycleEstimated bool
} }
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
@@ -1134,6 +1135,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
samples[i].FanAvgRPM = fanSample.AvgFanRPM samples[i].FanAvgRPM = fanSample.AvgFanRPM
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
} }
return samples, nil return samples, nil
} }
@@ -1141,11 +1143,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
func sampleBenchmarkCoolingSample() benchmarkCoolingSample { func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
fans, _ := sampleFanSpeeds() fans, _ := sampleFanSpeeds()
avgRPM, _, _ := fanRPMStats(fans) avgRPM, _, _ := fanRPMStats(fans)
dutyPct, dutyAvailable := sampleFanDutyCyclePct() dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
return benchmarkCoolingSample{ return benchmarkCoolingSample{
AvgFanRPM: avgRPM, AvgFanRPM: avgRPM,
AvgFanDutyCyclePct: dutyPct, AvgFanDutyCyclePct: dutyPct,
FanDutyCycleAvailable: dutyAvailable, FanDutyCycleAvailable: dutyAvailable,
FanDutyCycleEstimated: dutyEstimated,
} }
} }
@@ -1387,25 +1390,33 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
} }
var rpmValues []float64 var rpmValues []float64
var dutyValues []float64 var dutyValues []float64
var dutyEstimated bool
for _, row := range rows { for _, row := range rows {
if row.FanAvgRPM > 0 { if row.FanAvgRPM > 0 {
rpmValues = append(rpmValues, row.FanAvgRPM) rpmValues = append(rpmValues, row.FanAvgRPM)
} }
if row.FanDutyCycleAvailable { if row.FanDutyCycleAvailable {
dutyValues = append(dutyValues, row.FanDutyCyclePct) dutyValues = append(dutyValues, row.FanDutyCyclePct)
if row.FanDutyCycleEstimated {
dutyEstimated = true
}
} }
} }
if len(rpmValues) == 0 && len(dutyValues) == 0 { if len(rpmValues) == 0 && len(dutyValues) == 0 {
return nil return nil
} }
summary := &BenchmarkCoolingSummary{ summary := &BenchmarkCoolingSummary{
Available: true, Available: true,
AvgFanRPM: benchmarkMean(rpmValues), AvgFanRPM: benchmarkMean(rpmValues),
FanDutyCycleEstimated: dutyEstimated,
} }
if len(dutyValues) > 0 { if len(dutyValues) > 0 {
summary.FanDutyCycleAvailable = true summary.FanDutyCycleAvailable = true
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues) summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95) summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
if summary.FanDutyCycleEstimated {
summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
}
} else { } else {
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected") summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
} }
@@ -3491,10 +3502,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
_ = os.MkdirAll(stepDir, 0755) _ = os.MkdirAll(stepDir, 0755)
// Reuse the latest stable limits as starting points, but re-check every // Reuse the latest stable limits as starting points, but re-check every
// active GPU in this hotter configuration. // active GPU in this hotter configuration. For the newly introduced GPU,
seedForStep := make(map[int]int, len(stableLimits)) // seed from its single-card calibration so we do not restart from the
for k, v := range stableLimits { // default TDP when a prior derated limit is already known.
seedForStep[k] = v seedForStep := make(map[int]int, len(subset))
for _, idx := range subset {
if lim, ok := stableLimits[idx]; ok && lim > 0 {
seedForStep[idx] = lim
continue
}
if base, ok := calibByIndex[idx]; ok {
lim := int(math.Round(base.AppliedPowerLimitW))
if lim > 0 {
seedForStep[idx] = lim
}
}
} }
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d", logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",

View File

@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
Available bool `json:"available"` Available bool `json:"available"`
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"` AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"` FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"` AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"` P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
@@ -55,32 +56,32 @@ type NvidiaBenchmarkOptions struct {
} }
type NvidiaBenchmarkResult struct { type NvidiaBenchmarkResult struct {
BenchmarkVersion string `json:"benchmark_version"` BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"` GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"` Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"` ServerModel string `json:"server_model,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"` BenchmarkProfile string `json:"benchmark_profile"`
ParallelGPUs bool `json:"parallel_gpus,omitempty"` ParallelGPUs bool `json:"parallel_gpus,omitempty"`
RampStep int `json:"ramp_step,omitempty"` RampStep int `json:"ramp_step,omitempty"`
RampTotal int `json:"ramp_total,omitempty"` RampTotal int `json:"ramp_total,omitempty"`
RampRunID string `json:"ramp_run_id,omitempty"` RampRunID string `json:"ramp_run_id,omitempty"`
ScalabilityScore float64 `json:"scalability_score,omitempty"` ScalabilityScore float64 `json:"scalability_score,omitempty"`
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N. // PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
// 100% = each added GPU contributes exactly its single-card throughput. // 100% = each added GPU contributes exactly its single-card throughput.
// < 100% = throughput loss due to thermal throttle, power limits, or contention. // < 100% = throughput loss due to thermal throttle, power limits, or contention.
PlatformPowerScore float64 `json:"platform_power_score,omitempty"` PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"` PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
OverallStatus string `json:"overall_status"` OverallStatus string `json:"overall_status"`
SelectedGPUIndices []int `json:"selected_gpu_indices"` SelectedGPUIndices []int `json:"selected_gpu_indices"`
Findings []string `json:"findings,omitempty"` Findings []string `json:"findings,omitempty"`
Warnings []string `json:"warnings,omitempty"` Warnings []string `json:"warnings,omitempty"`
Normalization BenchmarkNormalization `json:"normalization"` Normalization BenchmarkNormalization `json:"normalization"`
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"` HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"` CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"` Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
GPUs []BenchmarkGPUResult `json:"gpus"` GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"` Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
} }
type BenchmarkNormalization struct { type BenchmarkNormalization struct {
@@ -223,8 +224,8 @@ type BenchmarkScorecard struct {
// Throttle breakdown — percentage of steady-state time in each throttle type. // Throttle breakdown — percentage of steady-state time in each throttle type.
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did. // Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"` SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
// Temperature headroom: distance to the 100°C destruction threshold. // Temperature headroom: distance to the 100°C destruction threshold.
@@ -300,22 +301,22 @@ type NvidiaPowerBenchResult struct {
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the // PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
// cumulative thermal ramp. Represents the actual sustained power budget of // cumulative thermal ramp. Represents the actual sustained power budget of
// this server under full GPU load. Use for rack power planning. // this server under full GPU load. Use for rack power planning.
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"` PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
// ServerPower captures IPMI server power delta (idle→loaded) measured in // ServerPower captures IPMI server power delta (idle→loaded) measured in
// parallel with the thermal ramp. Use to compare GPU-reported TDP against // parallel with the thermal ramp. Use to compare GPU-reported TDP against
// actual wall-power draw as seen by the server's power supply. // actual wall-power draw as seen by the server's power supply.
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
Findings []string `json:"findings,omitempty"` Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"` GPUs []NvidiaPowerBenchGPU `json:"gpus"`
} }
type NvidiaPowerBenchGPU struct { type NvidiaPowerBenchGPU struct {
Index int `json:"index"` Index int `json:"index"`
Name string `json:"name,omitempty"` Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"` BusID string `json:"bus_id,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// AppliedPowerLimitW is the stable limit found during single-card calibration. // AppliedPowerLimitW is the stable limit found during single-card calibration.
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"` AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
// StablePowerLimitW is the final fixed limit for this GPU after the // StablePowerLimitW is the final fixed limit for this GPU after the
// cumulative thermal ramp. This is the limit at which the GPU operated // cumulative thermal ramp. This is the limit at which the GPU operated
// stably with all other GPUs running simultaneously at their own limits. // stably with all other GPUs running simultaneously at their own limits.
@@ -333,10 +334,10 @@ type NvidiaPowerBenchGPU struct {
} }
type NvidiaPowerBenchStep struct { type NvidiaPowerBenchStep struct {
StepIndex int `json:"step_index"` StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"` GPUIndices []int `json:"gpu_indices"`
// NewGPUIndex is the GPU whose stable limit was searched in this step. // NewGPUIndex is the GPU whose stable limit was searched in this step.
NewGPUIndex int `json:"new_gpu_index"` NewGPUIndex int `json:"new_gpu_index"`
// NewGPUStableLimitW is the stable power limit found for the new GPU. // NewGPUStableLimitW is the stable power limit found for the new GPU.
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"` NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"` TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
@@ -349,15 +350,15 @@ type NvidiaPowerBenchStep struct {
// NvidiaPerformanceRampStep holds per-step performance data for the // NvidiaPerformanceRampStep holds per-step performance data for the
// scalability ramp-up phase of the performance benchmark. // scalability ramp-up phase of the performance benchmark.
type NvidiaPerformanceRampStep struct { type NvidiaPerformanceRampStep struct {
StepIndex int `json:"step_index"` StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"` GPUIndices []int `json:"gpu_indices"`
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent // TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
// TOPS from dedicated single-precision phases) across all GPUs in this step. // TOPS from dedicated single-precision phases) across all GPUs in this step.
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"` TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"` TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100. // ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss. // 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
ScalabilityPct float64 `json:"scalability_pct"` ScalabilityPct float64 `json:"scalability_pct"`
Status string `json:"status"` Status string `json:"status"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
} }

View File

@@ -27,6 +27,7 @@ type GPUMetricRow struct {
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"` FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"` FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"` FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
} }
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file. // WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer var b bytes.Buffer
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n") b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
for _, r := range rows { for _, r := range rows {
dutyAvail := 0 dutyAvail := 0
if r.FanDutyCycleAvailable { if r.FanDutyCycleAvailable {
dutyAvail = 1 dutyAvail = 1
} }
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n", dutyEstimated := 0
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail) if r.FanDutyCycleEstimated {
dutyEstimated = 1
}
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
} }
return os.WriteFile(path, b.Bytes(), 0644) return os.WriteFile(path, b.Bytes(), 0644)
} }

View File

@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
return string(raw), err return string(raw), err
} }
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs. // RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe. // Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
// detect GPU count selected, err := resolveDCGMGPUIndices(gpuIndices)
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output() if err != nil {
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n")) return "", err
}
gpuCount := len(selected)
if gpuCount < 1 { if gpuCount < 1 {
gpuCount = 1 gpuCount = 1
} }
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
satJob{name: "02-all-reduce-perf.log", cmd: []string{ satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20", "-g", strconv.Itoa(gpuCount), "--iters", "20",
}}, }, env: nvidiaVisibleDevicesEnv(selected)},
), logFunc) ), logFunc)
} }

View File

@@ -4,6 +4,7 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"math"
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
UpdatedAt time.Time UpdatedAt time.Time
} }
// fanObservationState holds the highest fan RPM ever observed per fan sensor,
// keyed by the sensor name. It is persisted as JSON at fanObservationStatePath
// so observed maxima survive process restarts.
type fanObservationState struct {
	MaxRPM map[string]float64 `json:"max_rpm"`
}
// fanPeakCandidate tracks an RPM reading above the currently recorded maximum
// that has not yet been held long enough (fanObservationMinPeakHold) to be
// committed as a new observed peak — guarding against one-sample spikes.
type fanPeakCandidate struct {
	FirstSeen time.Time // when this above-max reading was first seen
	RPM       float64   // highest RPM seen while the candidate is pending
}
var ( var (
systemPowerCacheMu sync.Mutex systemPowerCacheMu sync.Mutex
systemPowerCache cachedPowerReading systemPowerCache cachedPowerReading
fanObservationMu sync.Mutex
fanObservation fanObservationState
fanObservationInit bool
fanPeakCandidates = make(map[string]fanPeakCandidate)
) )
const systemPowerHoldTTL = 15 * time.Second const systemPowerHoldTTL = 15 * time.Second
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
const fanObservationMinPeakHold = time.Second
// normalizeObservedFanMaxRPM rounds an observed peak RPM up to the next full
// thousand so minor sampling jitter does not churn the persisted maximum.
// Non-positive readings normalize to 0.
func normalizeObservedFanMaxRPM(rpm float64) float64 {
	if rpm > 0 {
		return 1000.0 * math.Ceil(rpm/1000.0)
	}
	return 0
}
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds, // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv. // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling. // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output() out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
if err == nil { if err == nil {
if fans := parseFanSpeeds(string(out)); len(fans) > 0 { if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil return fans, nil
} }
} }
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON() fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
if len(fans) > 0 { if len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil return fans, nil
} }
if err != nil { if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
return nil, sensorsErr return nil, sensorsErr
} }
// loadFanObservationLocked lazily initializes the in-memory fan observation
// state from the persisted JSON file. The caller must hold fanObservationMu.
// It runs at most once per process; a missing, empty, or corrupt state file
// is tolerated and simply leaves the map empty (best effort).
func loadFanObservationLocked() {
	if fanObservationInit {
		return
	}
	fanObservationInit = true
	fanObservation.MaxRPM = make(map[string]float64)

	raw, err := os.ReadFile(fanObservationStatePath)
	if err != nil || len(raw) == 0 {
		return
	}
	var persisted fanObservationState
	if json.Unmarshal(raw, &persisted) != nil {
		return
	}
	// Keep only well-formed entries: named fans with a positive recorded max.
	for name, rpm := range persisted.MaxRPM {
		if trimmed := strings.TrimSpace(name); trimmed != "" && rpm > 0 {
			fanObservation.MaxRPM[trimmed] = rpm
		}
	}
}
// saveFanObservationLocked persists the observed fan maxima to disk as JSON.
// The caller must hold fanObservationMu. Best effort: any failure (mkdir,
// marshal, write) is silently ignored, and an empty state is never written.
func saveFanObservationLocked() {
	if len(fanObservation.MaxRPM) == 0 {
		return
	}
	dir := filepath.Dir(fanObservationStatePath)
	switch dir {
	case "", ".":
		// Defensive fallback when the configured path has no directory part.
		dir = "/var/log/bee-sat"
	}
	if os.MkdirAll(dir, 0755) != nil {
		return
	}
	raw, err := json.MarshalIndent(fanObservation, "", " ")
	if err != nil {
		return
	}
	_ = os.WriteFile(fanObservationStatePath, raw, 0644)
}
// updateFanObservation folds a fresh fan sample into the per-fan observed
// RPM maxima. A reading above the recorded max must persist for at least
// fanObservationMinPeakHold (tracked via fanPeakCandidates) before it is
// committed, so a single-sample spike cannot inflate the observed maximum.
// Committed maxima are rounded up to the next 1000 RPM and persisted to disk.
func updateFanObservation(fans []FanReading, now time.Time) {
	if len(fans) == 0 {
		return
	}
	fanObservationMu.Lock()
	defer fanObservationMu.Unlock()
	loadFanObservationLocked()
	changed := false
	for _, fan := range fans {
		name := strings.TrimSpace(fan.Name)
		if name == "" || fan.RPM <= 0 {
			continue
		}
		currentMax := fanObservation.MaxRPM[name]
		if fan.RPM <= currentMax {
			// Reading fell back under the recorded max: abandon any
			// pending candidate peak for this fan.
			delete(fanPeakCandidates, name)
			continue
		}
		if cand, ok := fanPeakCandidates[name]; ok {
			if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
				// Candidate held long enough: commit the highest RPM seen
				// during the hold window (or now, whichever is larger).
				newMax := math.Max(cand.RPM, fan.RPM)
				if newMax > currentMax {
					fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
					changed = true
				}
				delete(fanPeakCandidates, name)
				continue
			}
			// Still inside the hold window: remember the highest reading,
			// keeping the original FirstSeen timestamp.
			if fan.RPM > cand.RPM {
				fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
			}
			continue
		}
		// First above-max reading for this fan: start a new candidate.
		fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
	}
	if changed {
		saveFanObservationLocked()
	}
}
// estimateFanDutyCyclePctFromObservation approximates the fan duty cycle as
// the mean of current-RPM / observed-max-RPM ratios (clamped to 0..100)
// across all fans with a positive reading and a recorded observed maximum.
// Returns false when no fan qualifies.
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
	if len(fans) == 0 {
		return 0, false
	}
	fanObservationMu.Lock()
	defer fanObservationMu.Unlock()
	loadFanObservationLocked()

	var ratios []float64
	for _, fan := range fans {
		name := strings.TrimSpace(fan.Name)
		if name == "" || fan.RPM <= 0 {
			continue
		}
		maxRPM, ok := fanObservation.MaxRPM[name]
		if !ok || maxRPM <= 0 {
			continue
		}
		// Clamp so a reading above the recorded max reports 100%, not more.
		pct := math.Min(100, math.Max(0, fan.RPM/maxRPM*100.0))
		ratios = append(ratios, pct)
	}
	if len(ratios) == 0 {
		return 0, false
	}
	return benchmarkMean(ratios), true
}
// parseFanSpeeds parses "ipmitool sdr type Fan" output. // parseFanSpeeds parses "ipmitool sdr type Fan" output.
// Handles two formats: // Handles two formats:
// //
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors. // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
// Returns the average duty cycle across all exposed PWM controls. // Returns the average duty cycle across all exposed PWM controls.
func sampleFanDutyCyclePct() (float64, bool) { func sampleFanDutyCyclePct() (float64, bool, bool) {
out, err := exec.Command("sensors", "-j").Output() out, err := exec.Command("sensors", "-j").Output()
if err != nil || len(out) == 0 { if err != nil || len(out) == 0 {
return 0, false fans, fanErr := sampleFanSpeeds()
if fanErr != nil {
return 0, false, false
}
return sampleFanDutyCyclePctFromFans(fans)
} }
return parseFanDutyCyclePctSensorsJSON(out) pct, ok := parseFanDutyCyclePctSensorsJSON(out)
return pct, ok, false
}
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
if len(fans) == 0 {
return 0, false, false
}
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
return pct, true, true
}
return 0, false, false
} }
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) { func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {

View File

@@ -1,6 +1,7 @@
package platform package platform
import ( import (
"path/filepath"
"testing" "testing"
"time" "time"
) )
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
} }
} }
// TestEstimateFanDutyCyclePctFromObservation exercises the observed-max fan
// duty-cycle estimator end to end: peak-hold debouncing, the percentage
// computation against the normalized maximum, and reload of persisted state
// from disk after the in-memory state is cleared.
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
	t.Parallel()
	// Swap the package-level observation state for an isolated copy backed
	// by a temp file, restoring the originals when the test finishes.
	oldPath := fanObservationStatePath
	oldState := fanObservation
	oldInit := fanObservationInit
	oldCandidates := fanPeakCandidates
	fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
	fanObservation = fanObservationState{}
	fanObservationInit = false
	fanPeakCandidates = make(map[string]fanPeakCandidate)
	t.Cleanup(func() {
		fanObservationStatePath = oldPath
		fanObservation = oldState
		fanObservationInit = oldInit
		fanPeakCandidates = oldCandidates
	})
	start := time.Unix(100, 0)
	// A single above-max sample only creates a candidate; it must not yet
	// establish an observed maximum.
	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
	if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
		t.Fatalf("single-sample spike should not establish observed max")
	}
	// Held >= fanObservationMinPeakHold across samples: the peak of 5200
	// commits and is normalized up to 6000 RPM.
	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
	// 2600 / 6000 * 100 ≈ 43.3%.
	got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
	if !ok {
		t.Fatalf("expected estimated duty cycle from persisted observed max")
	}
	if got < 43 || got > 44 {
		t.Fatalf("got=%v want ~43.3", got)
	}
	// Clear the in-memory state; the estimator must lazily reload the
	// persisted maximum from disk and produce the same estimate.
	fanObservation = fanObservationState{}
	fanObservationInit = false
	fanPeakCandidates = make(map[string]fanPeakCandidate)
	got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
	if !ok {
		t.Fatalf("expected persisted observed max to be reloaded from disk")
	}
	if got < 43 || got > 44 {
		t.Fatalf("reloaded got=%v want ~43.3", got)
	}
}
func TestParseDCMIPowerReading(t *testing.T) { func TestParseDCMIPowerReading(t *testing.T) {
raw := ` raw := `
Instantaneous power reading: 512 Watts Instantaneous power reading: 512 Watts

View File

@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
} }
} }
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
if len(cmd) != len(want) {
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
}
for i := range want {
if cmd[i] != want[i] {
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
}
}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) { func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4}) env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 2 { if len(env) != 2 {

View File

@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`, `<code>all_reduce_perf</code> (NCCL tests)`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`, `Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
)) + )) +
`</div>` + `</div>` +
`<div id="sat-card-nvidia-bandwidth">` + `<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`, `<code>nvbandwidth</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`, `Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
)) + )) +
`</div>` + `</div>` +
`</div> `</div>
@@ -1527,8 +1527,6 @@ function satModeChanged() {
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'}, {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'}, {card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'}, {card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
].forEach(function(item) { ].forEach(function(item) {
const card = document.getElementById(item.card); const card = document.getElementById(item.card);
if (card) { if (card) {
@@ -1776,7 +1774,7 @@ function runAllSAT() {
const cycles = 1; const cycles = 1;
const status = document.getElementById('sat-all-status'); const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...'; status.textContent = 'Enqueuing...';
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth']; const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets()); const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const activeTargets = baseTargets.filter(target => { const activeTargets = baseTargets.filter(target => {
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false; if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -2082,7 +2080,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div> </div>
</div> </div>
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+` ` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
<div id="benchmark-output" style="display:none;margin-top:16px" class="card"> <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div> <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2517,7 +2515,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
func renderBurn() string { func renderBurn() string {
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div> return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div> <div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p> <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px"> <div class="card" style="margin-bottom:16px">

View File

@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
} }
} }
// TestValidatePageRendersNvidiaFabricCardsInValidateMode checks that the
// /validate page includes the NCCL and NVBandwidth cards with copy stating
// they run in Validate mode (i.e. they are no longer stress-only).
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/validate", nil)
	NewHandler(HandlerOptions{}).ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d", rec.Code)
	}
	page := rec.Body.String()
	needles := []string{
		`NVIDIA Interconnect (NCCL)`,
		`Runs in Validate and Stress.`,
		`NVIDIA Bandwidth (NVBandwidth)`,
		`Intended to stay short enough for Validate.`,
	}
	for _, needle := range needles {
		if strings.Contains(page, needle) {
			continue
		}
		t.Fatalf("validate page missing %q: %s", needle, page)
	}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) { func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{}) handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()

View File

@@ -736,15 +736,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
} }
dur := t.params.Duration archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
case "nvidia-stress": case "nvidia-stress":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")

View File

@@ -1,117 +0,0 @@
#!/bin/sh
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
# live-build chroot hook: renders the LiveCD wallpaper once at ISO build time
# so the image ships a static PNG rather than generating one at boot.
set -e
echo "=== generating bee wallpaper ==="
mkdir -p /usr/share/bee
# Quoted heredoc delimiter ('PYEOF') disables shell expansion inside the
# embedded Python program below.
python3 - <<'PYEOF'
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import os

# Fixed 1080p canvas; presumably the desktop environment scales the PNG for
# other resolutions — TODO confirm for non-16:9 displays.
W, H = 1920, 1080

# Box-drawing ASCII logo, rendered glyph-by-glyph into a mask below.
# NOTE(review): inter-column spacing may have been mangled in transit —
# verify against the original rendered wallpaper.
ASCII_ART = [
    " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
    " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
    " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
    " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
    " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
    " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
]
SUBTITLE = " Hardware Audit LiveCD"

# Palette: bright and dim amber for the logo face layers, dark amber drop
# shadow, muted amber subtitle, near-black background.
FG = (0xF6, 0xD0, 0x47)
FG_DIM = (0xD4, 0xA9, 0x1C)
SHADOW = (0x5E, 0x47, 0x05)
SUB = (0x96, 0x7A, 0x17)
BG = (0x05, 0x05, 0x05)

# Font search paths, most preferred first; load_font() picks the first path
# that exists and falls back to Pillow's built-in bitmap font.
MONO_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
]
SUB_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
]

def load_font(candidates, size):
    # Return the first available TrueType font at `size`, else the default.
    for path in candidates:
        if os.path.exists(path):
            return ImageFont.truetype(path, size)
    return ImageFont.load_default()

def mono_metrics(font):
    # Measure one monospace cell: width from textlength("M"), height from
    # the bounding box of "Mg" (covers both ascender and descender).
    probe = Image.new('L', (W, H), 0)
    draw = ImageDraw.Draw(probe)
    char_w = int(round(draw.textlength("M", font=font)))
    bb = draw.textbbox((0, 0), "Mg", font=font)
    char_h = bb[3] - bb[1]
    return char_w, char_h

def render_ascii_mask(font, lines, char_w, char_h, line_gap):
    # Rasterize the ASCII art into an 8-bit mask, one glyph per fixed-width
    # cell (spaces skipped); pasting a color through this mask draws the logo.
    width = max(len(line) for line in lines) * char_w
    height = len(lines) * char_h + line_gap * (len(lines) - 1)
    mask = Image.new('L', (width, height), 0)
    draw = ImageDraw.Draw(mask)
    for row, line in enumerate(lines):
        y = row * (char_h + line_gap)
        for col, ch in enumerate(line):
            if ch == ' ':
                continue
            x = col * char_w
            draw.text((x, y), ch, font=font, fill=255)
    return mask

img = Image.new('RGB', (W, H), BG)
draw = ImageDraw.Draw(img)
# Soft amber glow under the logo without depending on font rendering.
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
glow_draw = ImageDraw.Draw(glow)
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
glow = glow.filter(ImageFilter.GaussianBlur(60))
img = Image.alpha_composite(img.convert('RGBA'), glow)

# Pick a logo font size so the rendered art is roughly TARGET_LOGO_W pixels
# wide: probe the cell width at 64pt, then scale the point size in proportion.
TARGET_LOGO_W = 400
max_chars = max(len(line) for line in ASCII_ART)
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
_probe_cw, _ = mono_metrics(_probe_font)
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
char_w, char_h = mono_metrics(font_logo)
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
logo_w, logo_h = logo_mask.size
logo_x = (W - logo_w) // 2
logo_y = 380

# Three stacked pastes give a cheap extrusion effect: blurred dark shadow at
# 2*sh_off, dim mid layer at sh_off, bright face on top.
sh_off = max(1, font_size_logo // 6)
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
img.paste(FG, (logo_x, logo_y), logo_mask)

# Subtitle centered beneath the logo with a 2px drop shadow.
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
sub_y = logo_y + logo_h + 48
# Rebind the draw handle: `img` was replaced by alpha_composite above, so the
# earlier ImageDraw points at a stale image.
draw = ImageDraw.Draw(img)
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)

img = img.convert('RGB')
img.save('/usr/share/bee/wallpaper.png', optimize=True)
print('wallpaper written: /usr/share/bee/wallpaper.png')
PYEOF
echo "=== wallpaper done ==="