Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dca4afb8d0 | ||
|
|
b4280941f5 | ||
|
|
f74976ec4c | ||
|
|
18e24a9aa5 |
@@ -146,7 +146,7 @@ type satRunner interface {
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
|
||||
@@ -128,6 +128,7 @@ type fakeSAT struct {
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||
runNCCLFn func(string, []int) (string, error)
|
||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNCCLFn != nil {
|
||||
return f.runNCCLFn(baseDir, gpuIndices)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotBaseDir string
|
||||
var gotGPUIndices []int
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||
gotBaseDir = baseDir
|
||||
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||
return "/tmp/nccl-tests.tar.gz", nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("RunNCCLTests error: %v", err)
|
||||
}
|
||||
if path != "/tmp/nccl-tests.tar.gz" {
|
||||
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||
}
|
||||
if gotBaseDir != "/tmp/sat" {
|
||||
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||
}
|
||||
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -1122,6 +1122,7 @@ type benchmarkCoolingSample struct {
|
||||
AvgFanRPM float64
|
||||
AvgFanDutyCyclePct float64
|
||||
FanDutyCycleAvailable bool
|
||||
FanDutyCycleEstimated bool
|
||||
}
|
||||
|
||||
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
@@ -1134,6 +1135,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
samples[i].FanAvgRPM = fanSample.AvgFanRPM
|
||||
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
|
||||
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
|
||||
samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
|
||||
}
|
||||
return samples, nil
|
||||
}
|
||||
@@ -1141,11 +1143,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
|
||||
fans, _ := sampleFanSpeeds()
|
||||
avgRPM, _, _ := fanRPMStats(fans)
|
||||
dutyPct, dutyAvailable := sampleFanDutyCyclePct()
|
||||
dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
|
||||
return benchmarkCoolingSample{
|
||||
AvgFanRPM: avgRPM,
|
||||
AvgFanDutyCyclePct: dutyPct,
|
||||
FanDutyCycleAvailable: dutyAvailable,
|
||||
FanDutyCycleEstimated: dutyEstimated,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1387,25 +1390,33 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
|
||||
}
|
||||
var rpmValues []float64
|
||||
var dutyValues []float64
|
||||
var dutyEstimated bool
|
||||
for _, row := range rows {
|
||||
if row.FanAvgRPM > 0 {
|
||||
rpmValues = append(rpmValues, row.FanAvgRPM)
|
||||
}
|
||||
if row.FanDutyCycleAvailable {
|
||||
dutyValues = append(dutyValues, row.FanDutyCyclePct)
|
||||
if row.FanDutyCycleEstimated {
|
||||
dutyEstimated = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(rpmValues) == 0 && len(dutyValues) == 0 {
|
||||
return nil
|
||||
}
|
||||
summary := &BenchmarkCoolingSummary{
|
||||
Available: true,
|
||||
AvgFanRPM: benchmarkMean(rpmValues),
|
||||
Available: true,
|
||||
AvgFanRPM: benchmarkMean(rpmValues),
|
||||
FanDutyCycleEstimated: dutyEstimated,
|
||||
}
|
||||
if len(dutyValues) > 0 {
|
||||
summary.FanDutyCycleAvailable = true
|
||||
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
|
||||
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
|
||||
if summary.FanDutyCycleEstimated {
|
||||
summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
|
||||
}
|
||||
} else {
|
||||
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
|
||||
}
|
||||
@@ -3491,10 +3502,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
_ = os.MkdirAll(stepDir, 0755)
|
||||
|
||||
// Reuse the latest stable limits as starting points, but re-check every
|
||||
// active GPU in this hotter configuration.
|
||||
seedForStep := make(map[int]int, len(stableLimits))
|
||||
for k, v := range stableLimits {
|
||||
seedForStep[k] = v
|
||||
// active GPU in this hotter configuration. For the newly introduced GPU,
|
||||
// seed from its single-card calibration so we do not restart from the
|
||||
// default TDP when a prior derated limit is already known.
|
||||
seedForStep := make(map[int]int, len(subset))
|
||||
for _, idx := range subset {
|
||||
if lim, ok := stableLimits[idx]; ok && lim > 0 {
|
||||
seedForStep[idx] = lim
|
||||
continue
|
||||
}
|
||||
if base, ok := calibByIndex[idx]; ok {
|
||||
lim := int(math.Round(base.AppliedPowerLimitW))
|
||||
if lim > 0 {
|
||||
seedForStep[idx] = lim
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
|
||||
|
||||
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
|
||||
Available bool `json:"available"`
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
@@ -55,32 +56,32 @@ type NvidiaBenchmarkOptions struct {
|
||||
}
|
||||
|
||||
type NvidiaBenchmarkResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalization struct {
|
||||
@@ -223,8 +224,8 @@ type BenchmarkScorecard struct {
|
||||
|
||||
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||
|
||||
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||
@@ -300,22 +301,22 @@ type NvidiaPowerBenchResult struct {
|
||||
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||
// this server under full GPU load. Use for rack power planning.
|
||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||
// actual wall-power draw as seen by the server's power supply.
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchGPU struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||
// stably with all other GPUs running simultaneously at their own limits.
|
||||
@@ -333,10 +334,10 @@ type NvidiaPowerBenchGPU struct {
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||
NewGPUIndex int `json:"new_gpu_index"`
|
||||
NewGPUIndex int `json:"new_gpu_index"`
|
||||
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||
@@ -349,15 +350,15 @@ type NvidiaPowerBenchStep struct {
|
||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||
// scalability ramp-up phase of the performance benchmark.
|
||||
type NvidiaPerformanceRampStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||
ScalabilityPct float64 `json:"scalability_pct"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
ScalabilityPct float64 `json:"scalability_pct"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
|
||||
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||
for _, r := range rows {
|
||||
dutyAvail := 0
|
||||
if r.FanDutyCycleAvailable {
|
||||
dutyAvail = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
|
||||
dutyEstimated := 0
|
||||
if r.FanDutyCycleEstimated {
|
||||
dutyEstimated = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||
}
|
||||
return os.WriteFile(path, b.Bytes(), 0644)
|
||||
}
|
||||
|
||||
@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
gpuCount := len(selected)
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
type fanObservationState struct {
|
||||
MaxRPM map[string]float64 `json:"max_rpm"`
|
||||
}
|
||||
|
||||
type fanPeakCandidate struct {
|
||||
FirstSeen time.Time
|
||||
RPM float64
|
||||
}
|
||||
|
||||
var (
|
||||
systemPowerCacheMu sync.Mutex
|
||||
systemPowerCache cachedPowerReading
|
||||
fanObservationMu sync.Mutex
|
||||
fanObservation fanObservationState
|
||||
fanObservationInit bool
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
)
|
||||
|
||||
const systemPowerHoldTTL = 15 * time.Second
|
||||
|
||||
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
|
||||
|
||||
const fanObservationMinPeakHold = time.Second
|
||||
|
||||
func normalizeObservedFanMaxRPM(rpm float64) float64 {
|
||||
if rpm <= 0 {
|
||||
return 0
|
||||
}
|
||||
return math.Ceil(rpm/1000.0) * 1000.0
|
||||
}
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err == nil {
|
||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
}
|
||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||
if len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
if err != nil {
|
||||
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
return nil, sensorsErr
|
||||
}
|
||||
|
||||
func loadFanObservationLocked() {
|
||||
if fanObservationInit {
|
||||
return
|
||||
}
|
||||
fanObservationInit = true
|
||||
fanObservation.MaxRPM = make(map[string]float64)
|
||||
raw, err := os.ReadFile(fanObservationStatePath)
|
||||
if err != nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
var persisted fanObservationState
|
||||
if json.Unmarshal(raw, &persisted) != nil {
|
||||
return
|
||||
}
|
||||
for name, rpm := range persisted.MaxRPM {
|
||||
name = strings.TrimSpace(name)
|
||||
if name == "" || rpm <= 0 {
|
||||
continue
|
||||
}
|
||||
fanObservation.MaxRPM[name] = rpm
|
||||
}
|
||||
}
|
||||
|
||||
func saveFanObservationLocked() {
|
||||
if len(fanObservation.MaxRPM) == 0 {
|
||||
return
|
||||
}
|
||||
dir := filepath.Dir(fanObservationStatePath)
|
||||
if dir == "" || dir == "." {
|
||||
dir = "/var/log/bee-sat"
|
||||
}
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return
|
||||
}
|
||||
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||
}
|
||||
|
||||
func updateFanObservation(fans []FanReading, now time.Time) {
|
||||
if len(fans) == 0 {
|
||||
return
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
changed := false
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
currentMax := fanObservation.MaxRPM[name]
|
||||
if fan.RPM <= currentMax {
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if cand, ok := fanPeakCandidates[name]; ok {
|
||||
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
|
||||
newMax := math.Max(cand.RPM, fan.RPM)
|
||||
if newMax > currentMax {
|
||||
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
|
||||
changed = true
|
||||
}
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if fan.RPM > cand.RPM {
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
|
||||
}
|
||||
continue
|
||||
}
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
|
||||
}
|
||||
if changed {
|
||||
saveFanObservationLocked()
|
||||
}
|
||||
}
|
||||
|
||||
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
var samples []float64
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
maxRPM := fanObservation.MaxRPM[name]
|
||||
if maxRPM <= 0 {
|
||||
continue
|
||||
}
|
||||
pct := fan.RPM / maxRPM * 100.0
|
||||
if pct > 100 {
|
||||
pct = 100
|
||||
}
|
||||
if pct < 0 {
|
||||
pct = 0
|
||||
}
|
||||
samples = append(samples, pct)
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
//
|
||||
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||
|
||||
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||
// Returns the average duty cycle across all exposed PWM controls.
|
||||
func sampleFanDutyCyclePct() (float64, bool) {
|
||||
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return 0, false
|
||||
fans, fanErr := sampleFanSpeeds()
|
||||
if fanErr != nil {
|
||||
return 0, false, false
|
||||
}
|
||||
return sampleFanDutyCyclePctFromFans(fans)
|
||||
}
|
||||
return parseFanDutyCyclePctSensorsJSON(out)
|
||||
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||
return pct, ok, false
|
||||
}
|
||||
|
||||
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false, false
|
||||
}
|
||||
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||
return pct, true, true
|
||||
}
|
||||
return 0, false, false
|
||||
}
|
||||
|
||||
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldPath := fanObservationStatePath
|
||||
oldState := fanObservation
|
||||
oldInit := fanObservationInit
|
||||
oldCandidates := fanPeakCandidates
|
||||
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
t.Cleanup(func() {
|
||||
fanObservationStatePath = oldPath
|
||||
fanObservation = oldState
|
||||
fanObservationInit = oldInit
|
||||
fanPeakCandidates = oldCandidates
|
||||
})
|
||||
|
||||
start := time.Unix(100, 0)
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||
t.Fatalf("single-sample spike should not establish observed max")
|
||||
}
|
||||
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||
|
||||
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("got=%v want ~43.3", got)
|
||||
}
|
||||
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDCMIPowerReading(t *testing.T) {
|
||||
raw := `
|
||||
Instantaneous power reading: 512 Watts
|
||||
|
||||
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||
if len(env) != 2 {
|
||||
|
||||
@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
@@ -1527,8 +1527,6 @@ function satModeChanged() {
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
||||
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
@@ -1776,7 +1774,7 @@ function runAllSAT() {
|
||||
const cycles = 1;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
@@ -2082,7 +2080,7 @@ func renderBenchmark(opts HandlerOptions) string {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
|
||||
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||
|
||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||
@@ -2517,7 +2515,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
|
||||
@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`Runs in Validate and Stress.`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`Intended to stay short enough for Validate.`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
@@ -736,15 +736,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: platform.NvidiaStressLoaderNCCL,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
}, j.append)
|
||||
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||
case "nvidia-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
#!/bin/sh
|
||||
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
||||
set -e
|
||||
echo "=== generating bee wallpaper ==="
|
||||
mkdir -p /usr/share/bee
|
||||
|
||||
python3 - <<'PYEOF'
|
||||
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||
import os
|
||||
|
||||
W, H = 1920, 1080
|
||||
|
||||
ASCII_ART = [
|
||||
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
||||
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
||||
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
||||
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
||||
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
||||
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
||||
]
|
||||
SUBTITLE = " Hardware Audit LiveCD"
|
||||
|
||||
FG = (0xF6, 0xD0, 0x47)
|
||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||
SHADOW = (0x5E, 0x47, 0x05)
|
||||
SUB = (0x96, 0x7A, 0x17)
|
||||
BG = (0x05, 0x05, 0x05)
|
||||
|
||||
MONO_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
||||
]
|
||||
SUB_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
||||
]
|
||||
|
||||
|
||||
def load_font(candidates, size):
|
||||
for path in candidates:
|
||||
if os.path.exists(path):
|
||||
return ImageFont.truetype(path, size)
|
||||
return ImageFont.load_default()
|
||||
|
||||
|
||||
def mono_metrics(font):
|
||||
probe = Image.new('L', (W, H), 0)
|
||||
draw = ImageDraw.Draw(probe)
|
||||
char_w = int(round(draw.textlength("M", font=font)))
|
||||
bb = draw.textbbox((0, 0), "Mg", font=font)
|
||||
char_h = bb[3] - bb[1]
|
||||
return char_w, char_h
|
||||
|
||||
|
||||
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
|
||||
width = max(len(line) for line in lines) * char_w
|
||||
height = len(lines) * char_h + line_gap * (len(lines) - 1)
|
||||
mask = Image.new('L', (width, height), 0)
|
||||
draw = ImageDraw.Draw(mask)
|
||||
for row, line in enumerate(lines):
|
||||
y = row * (char_h + line_gap)
|
||||
for col, ch in enumerate(line):
|
||||
if ch == ' ':
|
||||
continue
|
||||
x = col * char_w
|
||||
draw.text((x, y), ch, font=font, fill=255)
|
||||
return mask
|
||||
|
||||
|
||||
img = Image.new('RGB', (W, H), BG)
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Soft amber glow under the logo without depending on font rendering.
|
||||
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
||||
glow_draw = ImageDraw.Draw(glow)
|
||||
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
||||
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||
|
||||
TARGET_LOGO_W = 400
|
||||
max_chars = max(len(line) for line in ASCII_ART)
|
||||
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
||||
_probe_cw, _ = mono_metrics(_probe_font)
|
||||
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
||||
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
||||
char_w, char_h = mono_metrics(font_logo)
|
||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
||||
logo_w, logo_h = logo_mask.size
|
||||
logo_x = (W - logo_w) // 2
|
||||
logo_y = 380
|
||||
|
||||
sh_off = max(1, font_size_logo // 6)
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
||||
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||
|
||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||
sub_y = logo_y + logo_h + 48
|
||||
draw = ImageDraw.Draw(img)
|
||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||
|
||||
img = img.convert('RGB')
|
||||
|
||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||
PYEOF
|
||||
|
||||
echo "=== wallpaper done ==="
|
||||
Reference in New Issue
Block a user