Compare commits

...

4 Commits
v8.18 ... v8.22

Author SHA1 Message Date
Mikhail Chusavitin
dca4afb8d0 Seed power ramp with single-card TDP limits 2026-04-16 11:43:01 +03:00
Mikhail Chusavitin
b4280941f5 Move NCCL and NVBandwidth into validate mode 2026-04-16 11:02:30 +03:00
Mikhail Chusavitin
f74976ec4c Use static overlay wallpaper in ISO build 2026-04-16 10:54:03 +03:00
Mikhail Chusavitin
18e24a9aa5 Estimate fan duty from observed RPM maxima 2026-04-16 10:10:18 +03:00
13 changed files with 379 additions and 199 deletions

View File

@@ -146,7 +146,7 @@ type satRunner interface {
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
} }
type runtimeChecker interface { type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc) return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
} }
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) { func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil) path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
body := "Results: " + path body := "Results: " + path
if err != nil && err != context.Canceled { if err != nil && err != context.Canceled {
body += "\nERROR: " + err.Error() body += "\nERROR: " + err.Error()

View File

@@ -128,6 +128,7 @@ type fakeSAT struct {
runNvidiaPowerFn func(string, int, []int) (string, error) runNvidiaPowerFn func(string, int, []int) (string, error)
runNvidiaPulseFn func(string, int, []int) (string, error) runNvidiaPulseFn func(string, int, []int) (string, error)
runNvidiaBandwidthFn func(string, []int) (string, error) runNvidiaBandwidthFn func(string, []int) (string, error)
runNCCLFn func(string, []int) (string, error)
runNvidiaTargetedStressFn func(string, int, []int) (string, error) runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runMemoryFn func(string) (string, error) runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error) runStorageFn func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
return "", nil return "", nil
} }
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) { func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNCCLFn != nil {
return f.runNCCLFn(baseDir, gpuIndices)
}
return "", nil return "", nil
} }
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
t.Parallel()
var gotBaseDir string
var gotGPUIndices []int
a := &App{
sat: fakeSAT{
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
gotBaseDir = baseDir
gotGPUIndices = append([]int(nil), gpuIndices...)
return "/tmp/nccl-tests.tar.gz", nil
},
},
}
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
if err != nil {
t.Fatalf("RunNCCLTests error: %v", err)
}
if path != "/tmp/nccl-tests.tar.gz" {
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
}
if gotBaseDir != "/tmp/sat" {
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
}
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
}
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) { func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -1122,6 +1122,7 @@ type benchmarkCoolingSample struct {
AvgFanRPM float64 AvgFanRPM float64
AvgFanDutyCyclePct float64 AvgFanDutyCyclePct float64
FanDutyCycleAvailable bool FanDutyCycleAvailable bool
FanDutyCycleEstimated bool
} }
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
@@ -1134,6 +1135,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
samples[i].FanAvgRPM = fanSample.AvgFanRPM samples[i].FanAvgRPM = fanSample.AvgFanRPM
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
} }
return samples, nil return samples, nil
} }
@@ -1141,11 +1143,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
func sampleBenchmarkCoolingSample() benchmarkCoolingSample { func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
fans, _ := sampleFanSpeeds() fans, _ := sampleFanSpeeds()
avgRPM, _, _ := fanRPMStats(fans) avgRPM, _, _ := fanRPMStats(fans)
dutyPct, dutyAvailable := sampleFanDutyCyclePct() dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
return benchmarkCoolingSample{ return benchmarkCoolingSample{
AvgFanRPM: avgRPM, AvgFanRPM: avgRPM,
AvgFanDutyCyclePct: dutyPct, AvgFanDutyCyclePct: dutyPct,
FanDutyCycleAvailable: dutyAvailable, FanDutyCycleAvailable: dutyAvailable,
FanDutyCycleEstimated: dutyEstimated,
} }
} }
@@ -1387,12 +1390,16 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
} }
var rpmValues []float64 var rpmValues []float64
var dutyValues []float64 var dutyValues []float64
var dutyEstimated bool
for _, row := range rows { for _, row := range rows {
if row.FanAvgRPM > 0 { if row.FanAvgRPM > 0 {
rpmValues = append(rpmValues, row.FanAvgRPM) rpmValues = append(rpmValues, row.FanAvgRPM)
} }
if row.FanDutyCycleAvailable { if row.FanDutyCycleAvailable {
dutyValues = append(dutyValues, row.FanDutyCyclePct) dutyValues = append(dutyValues, row.FanDutyCyclePct)
if row.FanDutyCycleEstimated {
dutyEstimated = true
}
} }
} }
if len(rpmValues) == 0 && len(dutyValues) == 0 { if len(rpmValues) == 0 && len(dutyValues) == 0 {
@@ -1401,11 +1408,15 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
summary := &BenchmarkCoolingSummary{ summary := &BenchmarkCoolingSummary{
Available: true, Available: true,
AvgFanRPM: benchmarkMean(rpmValues), AvgFanRPM: benchmarkMean(rpmValues),
FanDutyCycleEstimated: dutyEstimated,
} }
if len(dutyValues) > 0 { if len(dutyValues) > 0 {
summary.FanDutyCycleAvailable = true summary.FanDutyCycleAvailable = true
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues) summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95) summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
if summary.FanDutyCycleEstimated {
summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
}
} else { } else {
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected") summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
} }
@@ -3491,10 +3502,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
_ = os.MkdirAll(stepDir, 0755) _ = os.MkdirAll(stepDir, 0755)
// Reuse the latest stable limits as starting points, but re-check every // Reuse the latest stable limits as starting points, but re-check every
// active GPU in this hotter configuration. // active GPU in this hotter configuration. For the newly introduced GPU,
seedForStep := make(map[int]int, len(stableLimits)) // seed from its single-card calibration so we do not restart from the
for k, v := range stableLimits { // default TDP when a prior derated limit is already known.
seedForStep[k] = v seedForStep := make(map[int]int, len(subset))
for _, idx := range subset {
if lim, ok := stableLimits[idx]; ok && lim > 0 {
seedForStep[idx] = lim
continue
}
if base, ok := calibByIndex[idx]; ok {
lim := int(math.Round(base.AppliedPowerLimitW))
if lim > 0 {
seedForStep[idx] = lim
}
}
} }
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d", logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",

View File

@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
Available bool `json:"available"` Available bool `json:"available"`
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"` AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"` FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"` AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"` P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`

View File

@@ -27,6 +27,7 @@ type GPUMetricRow struct {
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"` FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"` FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"` FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
} }
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file. // WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer var b bytes.Buffer
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n") b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
for _, r := range rows { for _, r := range rows {
dutyAvail := 0 dutyAvail := 0
if r.FanDutyCycleAvailable { if r.FanDutyCycleAvailable {
dutyAvail = 1 dutyAvail = 1
} }
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n", dutyEstimated := 0
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail) if r.FanDutyCycleEstimated {
dutyEstimated = 1
}
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
} }
return os.WriteFile(path, b.Bytes(), 0644) return os.WriteFile(path, b.Bytes(), 0644)
} }

View File

@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
return string(raw), err return string(raw), err
} }
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs. // RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe. // Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
// detect GPU count selected, err := resolveDCGMGPUIndices(gpuIndices)
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output() if err != nil {
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n")) return "", err
}
gpuCount := len(selected)
if gpuCount < 1 { if gpuCount < 1 {
gpuCount = 1 gpuCount = 1
} }
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
satJob{name: "02-all-reduce-perf.log", cmd: []string{ satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20", "-g", strconv.Itoa(gpuCount), "--iters", "20",
}}, }, env: nvidiaVisibleDevicesEnv(selected)},
), logFunc) ), logFunc)
} }

View File

@@ -4,6 +4,7 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"math"
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
UpdatedAt time.Time UpdatedAt time.Time
} }
type fanObservationState struct {
MaxRPM map[string]float64 `json:"max_rpm"`
}
type fanPeakCandidate struct {
FirstSeen time.Time
RPM float64
}
var ( var (
systemPowerCacheMu sync.Mutex systemPowerCacheMu sync.Mutex
systemPowerCache cachedPowerReading systemPowerCache cachedPowerReading
fanObservationMu sync.Mutex
fanObservation fanObservationState
fanObservationInit bool
fanPeakCandidates = make(map[string]fanPeakCandidate)
) )
const systemPowerHoldTTL = 15 * time.Second const systemPowerHoldTTL = 15 * time.Second
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
const fanObservationMinPeakHold = time.Second
func normalizeObservedFanMaxRPM(rpm float64) float64 {
if rpm <= 0 {
return 0
}
return math.Ceil(rpm/1000.0) * 1000.0
}
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds, // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv. // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling. // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output() out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
if err == nil { if err == nil {
if fans := parseFanSpeeds(string(out)); len(fans) > 0 { if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil return fans, nil
} }
} }
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON() fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
if len(fans) > 0 { if len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil return fans, nil
} }
if err != nil { if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
return nil, sensorsErr return nil, sensorsErr
} }
func loadFanObservationLocked() {
if fanObservationInit {
return
}
fanObservationInit = true
fanObservation.MaxRPM = make(map[string]float64)
raw, err := os.ReadFile(fanObservationStatePath)
if err != nil || len(raw) == 0 {
return
}
var persisted fanObservationState
if json.Unmarshal(raw, &persisted) != nil {
return
}
for name, rpm := range persisted.MaxRPM {
name = strings.TrimSpace(name)
if name == "" || rpm <= 0 {
continue
}
fanObservation.MaxRPM[name] = rpm
}
}
func saveFanObservationLocked() {
if len(fanObservation.MaxRPM) == 0 {
return
}
dir := filepath.Dir(fanObservationStatePath)
if dir == "" || dir == "." {
dir = "/var/log/bee-sat"
}
if err := os.MkdirAll(dir, 0755); err != nil {
return
}
raw, err := json.MarshalIndent(fanObservation, "", " ")
if err != nil {
return
}
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
}
func updateFanObservation(fans []FanReading, now time.Time) {
if len(fans) == 0 {
return
}
fanObservationMu.Lock()
defer fanObservationMu.Unlock()
loadFanObservationLocked()
changed := false
for _, fan := range fans {
name := strings.TrimSpace(fan.Name)
if name == "" || fan.RPM <= 0 {
continue
}
currentMax := fanObservation.MaxRPM[name]
if fan.RPM <= currentMax {
delete(fanPeakCandidates, name)
continue
}
if cand, ok := fanPeakCandidates[name]; ok {
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
newMax := math.Max(cand.RPM, fan.RPM)
if newMax > currentMax {
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
changed = true
}
delete(fanPeakCandidates, name)
continue
}
if fan.RPM > cand.RPM {
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
}
continue
}
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
}
if changed {
saveFanObservationLocked()
}
}
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
if len(fans) == 0 {
return 0, false
}
fanObservationMu.Lock()
defer fanObservationMu.Unlock()
loadFanObservationLocked()
var samples []float64
for _, fan := range fans {
name := strings.TrimSpace(fan.Name)
if name == "" || fan.RPM <= 0 {
continue
}
maxRPM := fanObservation.MaxRPM[name]
if maxRPM <= 0 {
continue
}
pct := fan.RPM / maxRPM * 100.0
if pct > 100 {
pct = 100
}
if pct < 0 {
pct = 0
}
samples = append(samples, pct)
}
if len(samples) == 0 {
return 0, false
}
return benchmarkMean(samples), true
}
// parseFanSpeeds parses "ipmitool sdr type Fan" output. // parseFanSpeeds parses "ipmitool sdr type Fan" output.
// Handles two formats: // Handles two formats:
// //
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors. // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
// Returns the average duty cycle across all exposed PWM controls. // Returns the average duty cycle across all exposed PWM controls.
func sampleFanDutyCyclePct() (float64, bool) { func sampleFanDutyCyclePct() (float64, bool, bool) {
out, err := exec.Command("sensors", "-j").Output() out, err := exec.Command("sensors", "-j").Output()
if err != nil || len(out) == 0 { if err != nil || len(out) == 0 {
return 0, false fans, fanErr := sampleFanSpeeds()
if fanErr != nil {
return 0, false, false
} }
return parseFanDutyCyclePctSensorsJSON(out) return sampleFanDutyCyclePctFromFans(fans)
}
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
return pct, ok, false
}
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
if len(fans) == 0 {
return 0, false, false
}
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
return pct, true, true
}
return 0, false, false
} }
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) { func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {

View File

@@ -1,6 +1,7 @@
package platform package platform
import ( import (
"path/filepath"
"testing" "testing"
"time" "time"
) )
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
} }
} }
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
t.Parallel()
oldPath := fanObservationStatePath
oldState := fanObservation
oldInit := fanObservationInit
oldCandidates := fanPeakCandidates
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
fanObservation = fanObservationState{}
fanObservationInit = false
fanPeakCandidates = make(map[string]fanPeakCandidate)
t.Cleanup(func() {
fanObservationStatePath = oldPath
fanObservation = oldState
fanObservationInit = oldInit
fanPeakCandidates = oldCandidates
})
start := time.Unix(100, 0)
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
t.Fatalf("single-sample spike should not establish observed max")
}
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
if !ok {
t.Fatalf("expected estimated duty cycle from persisted observed max")
}
if got < 43 || got > 44 {
t.Fatalf("got=%v want ~43.3", got)
}
fanObservation = fanObservationState{}
fanObservationInit = false
fanPeakCandidates = make(map[string]fanPeakCandidate)
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
if !ok {
t.Fatalf("expected persisted observed max to be reloaded from disk")
}
if got < 43 || got > 44 {
t.Fatalf("reloaded got=%v want ~43.3", got)
}
}
func TestParseDCMIPowerReading(t *testing.T) { func TestParseDCMIPowerReading(t *testing.T) {
raw := ` raw := `
Instantaneous power reading: 512 Watts Instantaneous power reading: 512 Watts

View File

@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
} }
} }
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
if len(cmd) != len(want) {
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
}
for i := range want {
if cmd[i] != want[i] {
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
}
}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) { func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4}) env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 2 { if len(env) != 2 {

View File

@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`, `<code>all_reduce_perf</code> (NCCL tests)`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`, `Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
)) + )) +
`</div>` + `</div>` +
`<div id="sat-card-nvidia-bandwidth">` + `<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`, `<code>nvbandwidth</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`, `Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
)) + )) +
`</div>` + `</div>` +
`</div> `</div>
@@ -1527,8 +1527,6 @@ function satModeChanged() {
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'}, {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'}, {card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'}, {card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
].forEach(function(item) { ].forEach(function(item) {
const card = document.getElementById(item.card); const card = document.getElementById(item.card);
if (card) { if (card) {
@@ -1776,7 +1774,7 @@ function runAllSAT() {
const cycles = 1; const cycles = 1;
const status = document.getElementById('sat-all-status'); const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...'; status.textContent = 'Enqueuing...';
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth']; const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets()); const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const activeTargets = baseTargets.filter(target => { const activeTargets = baseTargets.filter(target => {
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false; if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -2082,7 +2080,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div> </div>
</div> </div>
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+` ` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
<div id="benchmark-output" style="display:none;margin-top:16px" class="card"> <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div> <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2517,7 +2515,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
func renderBurn() string { func renderBurn() string {
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div> return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div> <div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p> <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px"> <div class="card" style="margin-bottom:16px">

View File

@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
} }
} }
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA Interconnect (NCCL)`,
`Runs in Validate and Stress.`,
`NVIDIA Bandwidth (NVBandwidth)`,
`Intended to stay short enough for Validate.`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)
}
}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) { func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{}) handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()

View File

@@ -736,15 +736,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
} }
dur := t.params.Duration archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
case "nvidia-stress": case "nvidia-stress":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")

View File

@@ -1,117 +0,0 @@
#!/bin/sh
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
set -e
echo "=== generating bee wallpaper ==="
mkdir -p /usr/share/bee
python3 - <<'PYEOF'
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import os
W, H = 1920, 1080
ASCII_ART = [
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
]
SUBTITLE = " Hardware Audit LiveCD"
FG = (0xF6, 0xD0, 0x47)
FG_DIM = (0xD4, 0xA9, 0x1C)
SHADOW = (0x5E, 0x47, 0x05)
SUB = (0x96, 0x7A, 0x17)
BG = (0x05, 0x05, 0x05)
MONO_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
]
SUB_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
]
def load_font(candidates, size):
for path in candidates:
if os.path.exists(path):
return ImageFont.truetype(path, size)
return ImageFont.load_default()
def mono_metrics(font):
probe = Image.new('L', (W, H), 0)
draw = ImageDraw.Draw(probe)
char_w = int(round(draw.textlength("M", font=font)))
bb = draw.textbbox((0, 0), "Mg", font=font)
char_h = bb[3] - bb[1]
return char_w, char_h
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
width = max(len(line) for line in lines) * char_w
height = len(lines) * char_h + line_gap * (len(lines) - 1)
mask = Image.new('L', (width, height), 0)
draw = ImageDraw.Draw(mask)
for row, line in enumerate(lines):
y = row * (char_h + line_gap)
for col, ch in enumerate(line):
if ch == ' ':
continue
x = col * char_w
draw.text((x, y), ch, font=font, fill=255)
return mask
img = Image.new('RGB', (W, H), BG)
draw = ImageDraw.Draw(img)
# Soft amber glow under the logo without depending on font rendering.
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
glow_draw = ImageDraw.Draw(glow)
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
glow = glow.filter(ImageFilter.GaussianBlur(60))
img = Image.alpha_composite(img.convert('RGBA'), glow)
TARGET_LOGO_W = 400
max_chars = max(len(line) for line in ASCII_ART)
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
_probe_cw, _ = mono_metrics(_probe_font)
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
char_w, char_h = mono_metrics(font_logo)
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
logo_w, logo_h = logo_mask.size
logo_x = (W - logo_w) // 2
logo_y = 380
sh_off = max(1, font_size_logo // 6)
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
img.paste(FG, (logo_x, logo_y), logo_mask)
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
sub_y = logo_y + logo_h + 48
draw = ImageDraw.Draw(img)
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
img = img.convert('RGB')
img.save('/usr/share/bee/wallpaper.png', optimize=True)
print('wallpaper written: /usr/share/bee/wallpaper.png')
PYEOF
echo "=== wallpaper done ==="