Compare commits

...

4 Commits
v8.18 ... v8.22

Author SHA1 Message Date
Mikhail Chusavitin
dca4afb8d0 Seed power ramp with single-card TDP limits 2026-04-16 11:43:01 +03:00
Mikhail Chusavitin
b4280941f5 Move NCCL and NVBandwidth into validate mode 2026-04-16 11:02:30 +03:00
Mikhail Chusavitin
f74976ec4c Use static overlay wallpaper in ISO build 2026-04-16 10:54:03 +03:00
Mikhail Chusavitin
18e24a9aa5 Estimate fan duty from observed RPM maxima 2026-04-16 10:10:18 +03:00
13 changed files with 379 additions and 199 deletions

View File

@@ -146,7 +146,7 @@ type satRunner interface {
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
}
type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
body := "Results: " + path
if err != nil && err != context.Canceled {
body += "\nERROR: " + err.Error()

View File

@@ -128,6 +128,7 @@ type fakeSAT struct {
runNvidiaPowerFn func(string, int, []int) (string, error)
runNvidiaPulseFn func(string, int, []int) (string, error)
runNvidiaBandwidthFn func(string, []int) (string, error)
runNCCLFn func(string, []int) (string, error)
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
return "", nil
}
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNCCLFn != nil {
return f.runNCCLFn(baseDir, gpuIndices)
}
return "", nil
}
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
t.Parallel()
var gotBaseDir string
var gotGPUIndices []int
a := &App{
sat: fakeSAT{
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
gotBaseDir = baseDir
gotGPUIndices = append([]int(nil), gpuIndices...)
return "/tmp/nccl-tests.tar.gz", nil
},
},
}
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
if err != nil {
t.Fatalf("RunNCCLTests error: %v", err)
}
if path != "/tmp/nccl-tests.tar.gz" {
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
}
if gotBaseDir != "/tmp/sat" {
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
}
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
}
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()

View File

@@ -1122,6 +1122,7 @@ type benchmarkCoolingSample struct {
AvgFanRPM float64
AvgFanDutyCyclePct float64
FanDutyCycleAvailable bool
FanDutyCycleEstimated bool
}
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
@@ -1134,6 +1135,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
samples[i].FanAvgRPM = fanSample.AvgFanRPM
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
}
return samples, nil
}
@@ -1141,11 +1143,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
fans, _ := sampleFanSpeeds()
avgRPM, _, _ := fanRPMStats(fans)
dutyPct, dutyAvailable := sampleFanDutyCyclePct()
dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
return benchmarkCoolingSample{
AvgFanRPM: avgRPM,
AvgFanDutyCyclePct: dutyPct,
FanDutyCycleAvailable: dutyAvailable,
FanDutyCycleEstimated: dutyEstimated,
}
}
@@ -1387,25 +1390,33 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
}
var rpmValues []float64
var dutyValues []float64
var dutyEstimated bool
for _, row := range rows {
if row.FanAvgRPM > 0 {
rpmValues = append(rpmValues, row.FanAvgRPM)
}
if row.FanDutyCycleAvailable {
dutyValues = append(dutyValues, row.FanDutyCyclePct)
if row.FanDutyCycleEstimated {
dutyEstimated = true
}
}
}
if len(rpmValues) == 0 && len(dutyValues) == 0 {
return nil
}
summary := &BenchmarkCoolingSummary{
Available: true,
AvgFanRPM: benchmarkMean(rpmValues),
Available: true,
AvgFanRPM: benchmarkMean(rpmValues),
FanDutyCycleEstimated: dutyEstimated,
}
if len(dutyValues) > 0 {
summary.FanDutyCycleAvailable = true
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
if summary.FanDutyCycleEstimated {
summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
}
} else {
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
}
@@ -3491,10 +3502,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
_ = os.MkdirAll(stepDir, 0755)
// Reuse the latest stable limits as starting points, but re-check every
// active GPU in this hotter configuration.
seedForStep := make(map[int]int, len(stableLimits))
for k, v := range stableLimits {
seedForStep[k] = v
// active GPU in this hotter configuration. For the newly introduced GPU,
// seed from its single-card calibration so we do not restart from the
// default TDP when a prior derated limit is already known.
seedForStep := make(map[int]int, len(subset))
for _, idx := range subset {
if lim, ok := stableLimits[idx]; ok && lim > 0 {
seedForStep[idx] = lim
continue
}
if base, ok := calibByIndex[idx]; ok {
lim := int(math.Round(base.AppliedPowerLimitW))
if lim > 0 {
seedForStep[idx] = lim
}
}
}
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",

View File

@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
Available bool `json:"available"`
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
Notes []string `json:"notes,omitempty"`
@@ -55,32 +56,32 @@ type NvidiaBenchmarkOptions struct {
}
type NvidiaBenchmarkResult struct {
BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"`
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
RampStep int `json:"ramp_step,omitempty"`
RampTotal int `json:"ramp_total,omitempty"`
RampRunID string `json:"ramp_run_id,omitempty"`
ScalabilityScore float64 `json:"scalability_score,omitempty"`
BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"`
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
RampStep int `json:"ramp_step,omitempty"`
RampTotal int `json:"ramp_total,omitempty"`
RampRunID string `json:"ramp_run_id,omitempty"`
ScalabilityScore float64 `json:"scalability_score,omitempty"`
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
// 100% = each added GPU contributes exactly its single-card throughput.
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
OverallStatus string `json:"overall_status"`
SelectedGPUIndices []int `json:"selected_gpu_indices"`
Findings []string `json:"findings,omitempty"`
Warnings []string `json:"warnings,omitempty"`
Normalization BenchmarkNormalization `json:"normalization"`
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
OverallStatus string `json:"overall_status"`
SelectedGPUIndices []int `json:"selected_gpu_indices"`
Findings []string `json:"findings,omitempty"`
Warnings []string `json:"warnings,omitempty"`
Normalization BenchmarkNormalization `json:"normalization"`
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
}
type BenchmarkNormalization struct {
@@ -223,8 +224,8 @@ type BenchmarkScorecard struct {
// Throttle breakdown — percentage of steady-state time in each throttle type.
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
// Temperature headroom: distance to the 100°C destruction threshold.
@@ -300,22 +301,22 @@ type NvidiaPowerBenchResult struct {
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
// cumulative thermal ramp. Represents the actual sustained power budget of
// this server under full GPU load. Use for rack power planning.
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
// ServerPower captures IPMI server power delta (idle→loaded) measured in
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
// actual wall-power draw as seen by the server's power supply.
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
}
type NvidiaPowerBenchGPU struct {
Index int `json:"index"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
Index int `json:"index"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// AppliedPowerLimitW is the stable limit found during single-card calibration.
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
// StablePowerLimitW is the final fixed limit for this GPU after the
// cumulative thermal ramp. This is the limit at which the GPU operated
// stably with all other GPUs running simultaneously at their own limits.
@@ -333,10 +334,10 @@ type NvidiaPowerBenchGPU struct {
}
type NvidiaPowerBenchStep struct {
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
// NewGPUIndex is the GPU whose stable limit was searched in this step.
NewGPUIndex int `json:"new_gpu_index"`
NewGPUIndex int `json:"new_gpu_index"`
// NewGPUStableLimitW is the stable power limit found for the new GPU.
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
@@ -349,15 +350,15 @@ type NvidiaPowerBenchStep struct {
// NvidiaPerformanceRampStep holds per-step performance data for the
// scalability ramp-up phase of the performance benchmark.
type NvidiaPerformanceRampStep struct {
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
// TOPS from dedicated single-precision phases) across all GPUs in this step.
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
ScalabilityPct float64 `json:"scalability_pct"`
Status string `json:"status"`
Notes []string `json:"notes,omitempty"`
ScalabilityPct float64 `json:"scalability_pct"`
Status string `json:"status"`
Notes []string `json:"notes,omitempty"`
}

View File

@@ -27,6 +27,7 @@ type GPUMetricRow struct {
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
}
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
for _, r := range rows {
dutyAvail := 0
if r.FanDutyCycleAvailable {
dutyAvail = 1
}
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
dutyEstimated := 0
if r.FanDutyCycleEstimated {
dutyEstimated = 1
}
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
}
return os.WriteFile(path, b.Bytes(), 0644)
}

View File

@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
return string(raw), err
}
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
// detect GPU count
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
gpuCount := len(selected)
if gpuCount < 1 {
gpuCount = 1
}
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20",
}},
}, env: nvidiaVisibleDevicesEnv(selected)},
), logFunc)
}

View File

@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
UpdatedAt time.Time
}
type fanObservationState struct {
MaxRPM map[string]float64 `json:"max_rpm"`
}
type fanPeakCandidate struct {
FirstSeen time.Time
RPM float64
}
var (
systemPowerCacheMu sync.Mutex
systemPowerCache cachedPowerReading
fanObservationMu sync.Mutex
fanObservation fanObservationState
fanObservationInit bool
fanPeakCandidates = make(map[string]fanPeakCandidate)
)
const systemPowerHoldTTL = 15 * time.Second
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
const fanObservationMinPeakHold = time.Second
func normalizeObservedFanMaxRPM(rpm float64) float64 {
if rpm <= 0 {
return 0
}
return math.Ceil(rpm/1000.0) * 1000.0
}
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
if err == nil {
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil
}
}
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
if len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil
}
if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
return nil, sensorsErr
}
func loadFanObservationLocked() {
if fanObservationInit {
return
}
fanObservationInit = true
fanObservation.MaxRPM = make(map[string]float64)
raw, err := os.ReadFile(fanObservationStatePath)
if err != nil || len(raw) == 0 {
return
}
var persisted fanObservationState
if json.Unmarshal(raw, &persisted) != nil {
return
}
for name, rpm := range persisted.MaxRPM {
name = strings.TrimSpace(name)
if name == "" || rpm <= 0 {
continue
}
fanObservation.MaxRPM[name] = rpm
}
}
func saveFanObservationLocked() {
if len(fanObservation.MaxRPM) == 0 {
return
}
dir := filepath.Dir(fanObservationStatePath)
if dir == "" || dir == "." {
dir = "/var/log/bee-sat"
}
if err := os.MkdirAll(dir, 0755); err != nil {
return
}
raw, err := json.MarshalIndent(fanObservation, "", " ")
if err != nil {
return
}
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
}
func updateFanObservation(fans []FanReading, now time.Time) {
if len(fans) == 0 {
return
}
fanObservationMu.Lock()
defer fanObservationMu.Unlock()
loadFanObservationLocked()
changed := false
for _, fan := range fans {
name := strings.TrimSpace(fan.Name)
if name == "" || fan.RPM <= 0 {
continue
}
currentMax := fanObservation.MaxRPM[name]
if fan.RPM <= currentMax {
delete(fanPeakCandidates, name)
continue
}
if cand, ok := fanPeakCandidates[name]; ok {
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
newMax := math.Max(cand.RPM, fan.RPM)
if newMax > currentMax {
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
changed = true
}
delete(fanPeakCandidates, name)
continue
}
if fan.RPM > cand.RPM {
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
}
continue
}
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
}
if changed {
saveFanObservationLocked()
}
}
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
if len(fans) == 0 {
return 0, false
}
fanObservationMu.Lock()
defer fanObservationMu.Unlock()
loadFanObservationLocked()
var samples []float64
for _, fan := range fans {
name := strings.TrimSpace(fan.Name)
if name == "" || fan.RPM <= 0 {
continue
}
maxRPM := fanObservation.MaxRPM[name]
if maxRPM <= 0 {
continue
}
pct := fan.RPM / maxRPM * 100.0
if pct > 100 {
pct = 100
}
if pct < 0 {
pct = 0
}
samples = append(samples, pct)
}
if len(samples) == 0 {
return 0, false
}
return benchmarkMean(samples), true
}
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
// Handles two formats:
//
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
// Returns the average duty cycle across all exposed PWM controls.
func sampleFanDutyCyclePct() (float64, bool) {
func sampleFanDutyCyclePct() (float64, bool, bool) {
out, err := exec.Command("sensors", "-j").Output()
if err != nil || len(out) == 0 {
return 0, false
fans, fanErr := sampleFanSpeeds()
if fanErr != nil {
return 0, false, false
}
return sampleFanDutyCyclePctFromFans(fans)
}
return parseFanDutyCyclePctSensorsJSON(out)
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
return pct, ok, false
}
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
if len(fans) == 0 {
return 0, false, false
}
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
return pct, true, true
}
return 0, false, false
}
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {

View File

@@ -1,6 +1,7 @@
package platform
import (
"path/filepath"
"testing"
"time"
)
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
}
}
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
t.Parallel()
oldPath := fanObservationStatePath
oldState := fanObservation
oldInit := fanObservationInit
oldCandidates := fanPeakCandidates
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
fanObservation = fanObservationState{}
fanObservationInit = false
fanPeakCandidates = make(map[string]fanPeakCandidate)
t.Cleanup(func() {
fanObservationStatePath = oldPath
fanObservation = oldState
fanObservationInit = oldInit
fanPeakCandidates = oldCandidates
})
start := time.Unix(100, 0)
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
t.Fatalf("single-sample spike should not establish observed max")
}
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
if !ok {
t.Fatalf("expected estimated duty cycle from persisted observed max")
}
if got < 43 || got > 44 {
t.Fatalf("got=%v want ~43.3", got)
}
fanObservation = fanObservationState{}
fanObservationInit = false
fanPeakCandidates = make(map[string]fanPeakCandidate)
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
if !ok {
t.Fatalf("expected persisted observed max to be reloaded from disk")
}
if got < 43 || got > 44 {
t.Fatalf("reloaded got=%v want ~43.3", got)
}
}
func TestParseDCMIPowerReading(t *testing.T) {
raw := `
Instantaneous power reading: 512 Watts

View File

@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
}
}
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
if len(cmd) != len(want) {
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
}
for i := range want {
if cmd[i] != want[i] {
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
}
}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 2 {

View File

@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
)) +
`</div>` +
`<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
)) +
`</div>` +
`</div>
@@ -1527,8 +1527,6 @@ function satModeChanged() {
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
].forEach(function(item) {
const card = document.getElementById(item.card);
if (card) {
@@ -1776,7 +1774,7 @@ function runAllSAT() {
const cycles = 1;
const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...';
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const activeTargets = baseTargets.filter(target => {
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -2082,7 +2080,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div>
</div>
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2517,7 +2515,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
func renderBurn() string {
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px">

View File

@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
}
}
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA Interconnect (NCCL)`,
`Runs in Validate and Stress.`,
`NVIDIA Bandwidth (NVBandwidth)`,
`Intended to stay short enough for Validate.`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)
}
}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()

View File

@@ -736,15 +736,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-stress":
if a == nil {
err = fmt.Errorf("app not configured")

View File

@@ -1,117 +0,0 @@
#!/bin/sh
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
set -e
echo "=== generating bee wallpaper ==="
mkdir -p /usr/share/bee
python3 - <<'PYEOF'
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import os
W, H = 1920, 1080
ASCII_ART = [
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
]
SUBTITLE = " Hardware Audit LiveCD"
FG = (0xF6, 0xD0, 0x47)
FG_DIM = (0xD4, 0xA9, 0x1C)
SHADOW = (0x5E, 0x47, 0x05)
SUB = (0x96, 0x7A, 0x17)
BG = (0x05, 0x05, 0x05)
MONO_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
]
SUB_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
]
def load_font(candidates, size):
for path in candidates:
if os.path.exists(path):
return ImageFont.truetype(path, size)
return ImageFont.load_default()
def mono_metrics(font):
probe = Image.new('L', (W, H), 0)
draw = ImageDraw.Draw(probe)
char_w = int(round(draw.textlength("M", font=font)))
bb = draw.textbbox((0, 0), "Mg", font=font)
char_h = bb[3] - bb[1]
return char_w, char_h
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
width = max(len(line) for line in lines) * char_w
height = len(lines) * char_h + line_gap * (len(lines) - 1)
mask = Image.new('L', (width, height), 0)
draw = ImageDraw.Draw(mask)
for row, line in enumerate(lines):
y = row * (char_h + line_gap)
for col, ch in enumerate(line):
if ch == ' ':
continue
x = col * char_w
draw.text((x, y), ch, font=font, fill=255)
return mask
img = Image.new('RGB', (W, H), BG)
draw = ImageDraw.Draw(img)
# Soft amber glow under the logo without depending on font rendering.
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
glow_draw = ImageDraw.Draw(glow)
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
glow = glow.filter(ImageFilter.GaussianBlur(60))
img = Image.alpha_composite(img.convert('RGBA'), glow)
TARGET_LOGO_W = 400
max_chars = max(len(line) for line in ASCII_ART)
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
_probe_cw, _ = mono_metrics(_probe_font)
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
char_w, char_h = mono_metrics(font_logo)
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
logo_w, logo_h = logo_mask.size
logo_x = (W - logo_w) // 2
logo_y = 380
sh_off = max(1, font_size_logo // 6)
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
img.paste(FG, (logo_x, logo_y), logo_mask)
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
sub_y = logo_y + logo_h + 48
draw = ImageDraw.Draw(img)
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
img = img.convert('RGB')
img.save('/usr/share/bee/wallpaper.png', optimize=True)
print('wallpaper written: /usr/share/bee/wallpaper.png')
PYEOF
echo "=== wallpaper done ==="