Compare commits

..

4 Commits
v8.21 ... v8.23

Author SHA1 Message Date
Mikhail Chusavitin
5285c0d101 Capture per-run IPMI power and GPU telemetry in power benchmark
- Sample IPMI loaded_w per single-card calibration and per ramp step
  instead of averaging over the entire Phase 2; top-level ServerPower
  uses the final (all-GPU) ramp step value
- Add ServerLoadedW/ServerDeltaW to NvidiaPowerBenchGPU and
  NvidiaPowerBenchStep so external tooling can compare wall power per
  phase without re-parsing logs
- Write gpu-metrics.csv/.html inside each single-XX/ and step-XX/
  subdir; aggregate all phases into a top-level gpu-metrics.csv/.html
- Write 00-nvidia-smi-q.log at the start of every power run
- Add Telemetry (p95 temp/power/fan/clock) to NvidiaPowerBenchGPU in
  result.json from the converged calibration attempt
- Power benchmark page: split "Achieved W" into Single-card W and
  Multi-GPU W (StablePowerLimitW); derate highlight and status color
  now reflect the final multi-GPU limit vs nominal
- Performance benchmark page: add Status column and per-GPU score
  color coding (green/yellow/red) based on gpu.Status and OverallStatus

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-17 17:59:58 +03:00
Mikhail Chusavitin
dca4afb8d0 Seed power ramp with single-card TDP limits 2026-04-16 11:43:01 +03:00
Mikhail Chusavitin
b4280941f5 Move NCCL and NVBandwidth into validate mode 2026-04-16 11:02:30 +03:00
Mikhail Chusavitin
f74976ec4c Use static overlay wallpaper in ISO build 2026-04-16 10:54:03 +03:00
10 changed files with 284 additions and 197 deletions

View File

@@ -146,7 +146,7 @@ type satRunner interface {
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
} }
type runtimeChecker interface { type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc) return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
} }
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) { func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil) path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
body := "Results: " + path body := "Results: " + path
if err != nil && err != context.Canceled { if err != nil && err != context.Canceled {
body += "\nERROR: " + err.Error() body += "\nERROR: " + err.Error()

View File

@@ -128,6 +128,7 @@ type fakeSAT struct {
runNvidiaPowerFn func(string, int, []int) (string, error) runNvidiaPowerFn func(string, int, []int) (string, error)
runNvidiaPulseFn func(string, int, []int) (string, error) runNvidiaPulseFn func(string, int, []int) (string, error)
runNvidiaBandwidthFn func(string, []int) (string, error) runNvidiaBandwidthFn func(string, []int) (string, error)
runNCCLFn func(string, []int) (string, error)
runNvidiaTargetedStressFn func(string, int, []int) (string, error) runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runMemoryFn func(string) (string, error) runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error) runStorageFn func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
return "", nil return "", nil
} }
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) { func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNCCLFn != nil {
return f.runNCCLFn(baseDir, gpuIndices)
}
return "", nil return "", nil
} }
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
t.Parallel()
var gotBaseDir string
var gotGPUIndices []int
a := &App{
sat: fakeSAT{
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
gotBaseDir = baseDir
gotGPUIndices = append([]int(nil), gpuIndices...)
return "/tmp/nccl-tests.tar.gz", nil
},
},
}
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
if err != nil {
t.Fatalf("RunNCCLTests error: %v", err)
}
if path != "/tmp/nccl-tests.tar.gz" {
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
}
if gotBaseDir != "/tmp/sat" {
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
}
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
}
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) { func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -59,6 +59,9 @@ type benchmarkPowerCalibrationResult struct {
// ≥20% while server fans were below 100% duty cycle — a signal that the // ≥20% while server fans were below 100% duty cycle — a signal that the
// cooling system may not be correctly configured for full GPU load. // cooling system may not be correctly configured for full GPU load.
CoolingWarning string CoolingWarning string
// MetricRows holds the telemetry rows from the final (converged) attempt
// for this GPU. Used to build per-run gpu-metrics.csv.
MetricRows []GPUMetricRow
} }
type benchmarkBurnProfile struct { type benchmarkBurnProfile struct {
@@ -2781,7 +2784,7 @@ func runBenchmarkPowerCalibration(
infoByIndex map[int]benchmarkGPUInfo, infoByIndex map[int]benchmarkGPUInfo,
logFunc func(string), logFunc func(string),
seedLimits map[int]int, seedLimits map[int]int,
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
const calibDurationSec = 120 const calibDurationSec = 120
const maxDerateW = 150 const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts. // calibSearchTolerance is the binary-search convergence threshold in watts.
@@ -2795,7 +2798,7 @@ func runBenchmarkPowerCalibration(
if _, err := exec.LookPath("dcgmi"); err != nil { if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil return map[int]benchmarkPowerCalibrationResult{}, nil, nil
} }
if killed := KillTestWorkers(); len(killed) > 0 { if killed := KillTestWorkers(); len(killed) > 0 {
for _, p := range killed { for _, p := range killed {
@@ -2829,6 +2832,8 @@ func runBenchmarkPowerCalibration(
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices)) results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction var restore []benchmarkRestoreAction
var allCalibRows []GPUMetricRow // accumulated telemetry across all attempts
var calibCursor float64
// Initialise per-GPU state. // Initialise per-GPU state.
states := make([]*gpuCalibState, 0, len(gpuIndices)) states := make([]*gpuCalibState, 0, len(gpuIndices))
@@ -2981,6 +2986,8 @@ calibDone:
ticker.Stop() ticker.Stop()
cancelAttempt() cancelAttempt()
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644) _ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
// Accumulate telemetry rows with attempt stage label.
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
// Resource busy: retry with exponential back-off (shared — one DCGM session). // Resource busy: retry with exponential back-off (shared — one DCGM session).
if ar.err != nil && isDCGMResourceBusy(ar.err) { if ar.err != nil && isDCGMResourceBusy(ar.err) {
@@ -3065,6 +3072,7 @@ calibDone:
} }
} }
} }
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
s.converged = true s.converged = true
continue continue
} }
@@ -3103,6 +3111,7 @@ calibDone:
} else { } else {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
} }
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
s.converged = true s.converged = true
continue continue
} }
@@ -3140,7 +3149,8 @@ calibDone:
results[s.idx] = s.calib results[s.idx] = s.calib
} }
} }
return results, restore writeBenchmarkMetricsFiles(runDir, allCalibRows)
return results, restore, allCalibRows
} }
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222), // isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3230,21 +3240,25 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
} }
if len(result.RampSteps) > 0 { if len(result.RampSteps) > 0 {
b.WriteString("## Ramp Sequence\n\n") b.WriteString("## Ramp Sequence\n\n")
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n") b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
b.WriteString("|------|---------|--------------|----------------|---------|--------|\n") b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
for _, step := range result.RampSteps { for _, step := range result.RampSteps {
derated := "-" derated := "-"
if step.Derated { if step.Derated {
derated = "⚠ yes" derated = "⚠ yes"
} }
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n", serverDelta := "-"
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status) if step.ServerDeltaW > 0 {
serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
}
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
} }
b.WriteString("\n") b.WriteString("\n")
} }
b.WriteString("## Per-Slot Results\n\n") b.WriteString("## Per-Slot Results\n\n")
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n") b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n") b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
stableLimit := "-" stableLimit := "-"
if gpu.StablePowerLimitW > 0 { if gpu.StablePowerLimitW > 0 {
@@ -3254,8 +3268,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW) stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
} }
} }
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n", serverDelta := "-"
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts) if gpu.ServerDeltaW > 0 {
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
}
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
} }
b.WriteString("\n") b.WriteString("\n")
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
@@ -3284,11 +3302,19 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex) fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW) fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW) fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
if step.ServerLoadedW > 0 {
fmt.Fprintf(&b, "ramp_step_%d_server_loaded_w=%.0f\n", step.StepIndex, step.ServerLoadedW)
fmt.Fprintf(&b, "ramp_step_%d_server_delta_w=%.0f\n", step.StepIndex, step.ServerDeltaW)
}
} }
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
if gpu.StablePowerLimitW > 0 { if gpu.StablePowerLimitW > 0 {
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW) fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
} }
if gpu.ServerLoadedW > 0 {
fmt.Fprintf(&b, "gpu_%d_server_loaded_w=%.0f\n", gpu.Index, gpu.ServerLoadedW)
fmt.Fprintf(&b, "gpu_%d_server_delta_w=%.0f\n", gpu.Index, gpu.ServerDeltaW)
}
} }
if sp := result.ServerPower; sp != nil && sp.Available { if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW) fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
@@ -3327,6 +3353,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
if infoErr != nil { if infoErr != nil {
return "", infoErr return "", infoErr
} }
// Capture full nvidia-smi -q snapshot at the start of the run.
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
hostname, _ := os.Hostname() hostname, _ := os.Hostname()
result := NvidiaPowerBenchResult{ result := NvidiaPowerBenchResult{
BenchmarkVersion: benchmarkVersion, BenchmarkVersion: benchmarkVersion,
@@ -3352,13 +3382,31 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to // Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat. // establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected)) calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
singleIPMILoadedW := make(map[int]float64, len(selected))
var allRestoreActions []benchmarkRestoreAction var allRestoreActions []benchmarkRestoreAction
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
var allPowerRows []GPUMetricRow
var powerCursor float64
for _, idx := range selected { for _, idx := range selected {
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx)) singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
_ = os.MkdirAll(singleDir, 0755) _ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex) singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil) ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
ipmiSingleDone := make(chan float64, 1)
go func() {
defer close(ipmiSingleDone)
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
ipmiSingleDone <- w
}
}()
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
ipmiSingleCancel()
if w, ok := <-ipmiSingleDone; ok {
singleIPMILoadedW[idx] = w
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w))
}
allRestoreActions = append(allRestoreActions, restore...) allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok { if r, ok := c[idx]; ok {
calibByIndex[idx] = r calibByIndex[idx] = r
@@ -3383,7 +3431,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.OverallStatus = "PARTIAL" result.OverallStatus = "PARTIAL"
} }
} }
gpus = append(gpus, NvidiaPowerBenchGPU{ gpu := NvidiaPowerBenchGPU{
Index: idx, Index: idx,
Name: info.Name, Name: info.Name,
BusID: info.BusID, BusID: info.BusID,
@@ -3396,7 +3444,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
Status: status, Status: status,
Notes: append([]string(nil), calib.Notes...), Notes: append([]string(nil), calib.Notes...),
CoolingWarning: calib.CoolingWarning, CoolingWarning: calib.CoolingWarning,
}) }
if w, ok := singleIPMILoadedW[idx]; ok && serverIdleOK && w > 0 {
gpu.ServerLoadedW = w
gpu.ServerDeltaW = w - serverIdleW
}
if len(calib.MetricRows) > 0 {
t := summarizeBenchmarkTelemetry(calib.MetricRows)
gpu.Telemetry = &t
}
gpus = append(gpus, gpu)
} }
sort.Slice(gpus, func(i, j int) bool { sort.Slice(gpus, func(i, j int) bool {
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW { if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
@@ -3445,20 +3502,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// stableLimits accumulates GPU index → fixed stable limit (W) across steps. // stableLimits accumulates GPU index → fixed stable limit (W) across steps.
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder)) stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture // serverLoadedW tracks the IPMI server power from the final ramp step
// server-side loaded power while GPUs are under stress. The goroutine is // (all GPUs simultaneously loaded). Earlier steps' values are stored
// cancelled as soon as Phase 2 finishes, and the average is used to compare // per-step in NvidiaPowerBenchStep.ServerLoadedW.
// against PlatformMaxTDPW (GPU-reported stable limits sum).
var serverLoadedW float64 var serverLoadedW float64
var serverLoadedOK bool var serverLoadedOK bool
ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
ipmiPhase2Done := make(chan float64, 1)
go func() {
defer close(ipmiPhase2Done)
if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
ipmiPhase2Done <- w
}
}()
// Step 1: reuse single-card calibration result directly. // Step 1: reuse single-card calibration result directly.
if len(result.RecommendedSlotOrder) > 0 { if len(result.RecommendedSlotOrder) > 0 {
@@ -3475,6 +3523,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
Derated: firstCalib.Derated, Derated: firstCalib.Derated,
Status: "OK", Status: "OK",
} }
if w, ok := singleIPMILoadedW[firstIdx]; ok && serverIdleOK && w > 0 {
ramp.ServerLoadedW = w
ramp.ServerDeltaW = w - serverIdleW
}
if !firstCalib.Completed { if !firstCalib.Completed {
ramp.Status = "FAILED" ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx)) ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
@@ -3502,17 +3554,45 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
_ = os.MkdirAll(stepDir, 0755) _ = os.MkdirAll(stepDir, 0755)
// Reuse the latest stable limits as starting points, but re-check every // Reuse the latest stable limits as starting points, but re-check every
// active GPU in this hotter configuration. // active GPU in this hotter configuration. For the newly introduced GPU,
seedForStep := make(map[int]int, len(stableLimits)) // seed from its single-card calibration so we do not restart from the
for k, v := range stableLimits { // default TDP when a prior derated limit is already known.
seedForStep[k] = v seedForStep := make(map[int]int, len(subset))
for _, idx := range subset {
if lim, ok := stableLimits[idx]; ok && lim > 0 {
seedForStep[idx] = lim
continue
}
if base, ok := calibByIndex[idx]; ok {
lim := int(math.Round(base.AppliedPowerLimitW))
if lim > 0 {
seedForStep[idx] = lim
}
}
} }
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d", logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx)) step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep) ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
ipmiStepDone := make(chan float64, 1)
go func() {
defer close(ipmiStepDone)
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
ipmiStepDone <- w
}
}()
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
ipmiStepCancel()
var stepIPMILoadedW float64
var stepIPMIOK bool
if w, ok := <-ipmiStepDone; ok {
stepIPMILoadedW = w
stepIPMIOK = true
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
}
// Accumulate restore actions; they all run in the outer defer. // Accumulate restore actions; they all run in the outer defer.
allRestoreActions = append(allRestoreActions, stepRestore...) allRestoreActions = append(allRestoreActions, stepRestore...)
@@ -3575,15 +3655,17 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
} }
result.RampSteps = append(result.RampSteps, ramp) if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
} ramp.ServerLoadedW = stepIPMILoadedW
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
// The last step has all GPUs loaded — use it as the top-level loaded_w.
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = stepIPMILoadedW
serverLoadedOK = true
}
}
// Stop IPMI Phase 2 sampling and collect result. result.RampSteps = append(result.RampSteps, ramp)
ipmiPhase2Cancel()
if w, ok := <-ipmiPhase2Done; ok {
serverLoadedW = w
serverLoadedOK = true
logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
} }
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits. // Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
@@ -3613,6 +3695,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP. // ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
_ = serverIdleOK // used implicitly via characterizeServerPower _ = serverIdleOK // used implicitly via characterizeServerPower
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK) result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
// Write top-level gpu-metrics.csv/.html aggregating all phases.
writeBenchmarkMetricsFiles(runDir, allPowerRows)
resultJSON, err := json.MarshalIndent(result, "", " ") resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil { if err != nil {
return "", fmt.Errorf("marshal power result: %w", err) return "", fmt.Errorf("marshal power result: %w", err)

View File

@@ -331,6 +331,13 @@ type NvidiaPowerBenchGPU struct {
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow. // CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
CoolingWarning string `json:"cooling_warning,omitempty"` CoolingWarning string `json:"cooling_warning,omitempty"`
// ServerLoadedW is the IPMI server power reading captured during this
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW idle.
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
// Telemetry holds the aggregated stats from the final converged calibration
// attempt for this GPU (temperature, power, fan, clock percentiles).
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
} }
type NvidiaPowerBenchStep struct { type NvidiaPowerBenchStep struct {
@@ -345,6 +352,10 @@ type NvidiaPowerBenchStep struct {
Derated bool `json:"derated,omitempty"` Derated bool `json:"derated,omitempty"`
Status string `json:"status"` Status string `json:"status"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
// ServerLoadedW is the IPMI server power reading captured during this
// ramp step's calibration run. ServerDeltaW = ServerLoadedW idle.
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
} }
// NvidiaPerformanceRampStep holds per-step performance data for the // NvidiaPerformanceRampStep holds per-step performance data for the

View File

@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
return string(raw), err return string(raw), err
} }
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs. // RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe. // Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
// detect GPU count selected, err := resolveDCGMGPUIndices(gpuIndices)
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output() if err != nil {
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n")) return "", err
}
gpuCount := len(selected)
if gpuCount < 1 { if gpuCount < 1 {
gpuCount = 1 gpuCount = 1
} }
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
satJob{name: "02-all-reduce-perf.log", cmd: []string{ satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20", "-g", strconv.Itoa(gpuCount), "--iters", "20",
}}, }, env: nvidiaVisibleDevicesEnv(selected)},
), logFunc) ), logFunc)
} }

View File

@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
} }
} }
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
if len(cmd) != len(want) {
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
}
for i := range want {
if cmd[i] != want[i] {
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
}
}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) { func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4}) env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 2 { if len(env) != 2 {

View File

@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`, `<code>all_reduce_perf</code> (NCCL tests)`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`, `Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
)) + )) +
`</div>` + `</div>` +
`<div id="sat-card-nvidia-bandwidth">` + `<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`, `<code>nvbandwidth</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`, `Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
)) + )) +
`</div>` + `</div>` +
`</div> `</div>
@@ -1527,8 +1527,6 @@ function satModeChanged() {
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'}, {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'}, {card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'}, {card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
].forEach(function(item) { ].forEach(function(item) {
const card = document.getElementById(item.card); const card = document.getElementById(item.card);
if (card) { if (card) {
@@ -1776,7 +1774,7 @@ function runAllSAT() {
const cycles = 1; const cycles = 1;
const status = document.getElementById('sat-all-status'); const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...'; status.textContent = 'Enqueuing...';
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth']; const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets()); const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const activeTargets = baseTargets.filter(target => { const activeTargets = baseTargets.filter(target => {
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false; if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -2016,9 +2014,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
// ── Benchmark ───────────────────────────────────────────────────────────────── // ── Benchmark ─────────────────────────────────────────────────────────────────
type benchmarkHistoryRun struct { type benchmarkHistoryRun struct {
generatedAt time.Time generatedAt time.Time
displayTime string displayTime string
gpuScores map[int]float64 // GPU index → composite score gpuScores map[int]float64 // GPU index → composite score
gpuStatuses map[int]string // GPU index → status ("OK", "WARNING", "FAILED", …)
overallStatus string
} }
func renderBenchmark(opts HandlerOptions) string { func renderBenchmark(opts HandlerOptions) string {
@@ -2082,7 +2082,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div> </div>
</div> </div>
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+` ` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
<div id="benchmark-output" style="display:none;margin-top:16px" class="card"> <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div> <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2326,7 +2326,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`) b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
} }
b.WriteString(`<div style="overflow-x:auto">`) b.WriteString(`<div style="overflow-x:auto">`)
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`) b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
for i := 0; i <= maxGPUIndex; i++ { for i := 0; i <= maxGPUIndex; i++ {
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`) b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
} }
@@ -2335,13 +2335,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
b.WriteString(`<tr>`) b.WriteString(`<tr>`)
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`) b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`) b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
overallColor := "var(--ok)"
overallLabel := run.overallStatus
if overallLabel == "" {
overallLabel = "OK"
}
if overallLabel == "FAILED" {
overallColor = "var(--crit-fg,#9f3a38)"
} else if overallLabel != "OK" {
overallColor = "var(--warn)"
}
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
for idx := 0; idx <= maxGPUIndex; idx++ { for idx := 0; idx <= maxGPUIndex; idx++ {
score, ok := run.gpuScores[idx] score, ok := run.gpuScores[idx]
if !ok { if !ok {
b.WriteString(`<td style="color:var(--muted)">-</td>`) b.WriteString(`<td style="color:var(--muted)">-</td>`)
continue continue
} }
b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`) gpuStatus := run.gpuStatuses[idx]
scoreColor := ""
switch gpuStatus {
case "FAILED":
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
case "WARNING", "PARTIAL":
scoreColor = ` style="color:var(--warn);font-weight:600"`
case "", "OK":
// no override
default:
scoreColor = ` style="color:var(--warn);font-weight:600"`
}
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
} }
b.WriteString(`</tr>`) b.WriteString(`</tr>`)
} }
@@ -2375,12 +2398,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
continue continue
} }
run := benchmarkHistoryRun{ run := benchmarkHistoryRun{
generatedAt: result.GeneratedAt, generatedAt: result.GeneratedAt,
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"), displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
gpuScores: make(map[int]float64), gpuScores: make(map[int]float64),
gpuStatuses: make(map[int]string),
overallStatus: result.OverallStatus,
} }
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
run.gpuStatuses[gpu.Index] = gpu.Status
if gpu.Index > maxGPUIndex { if gpu.Index > maxGPUIndex {
maxGPUIndex = gpu.Index maxGPUIndex = gpu.Index
} }
@@ -2449,31 +2475,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
if len(latest.GPUs) > 0 { if len(latest.GPUs) > 0 {
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`) b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`) b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
b.WriteString(`</tr></thead><tbody>`) b.WriteString(`</tr></thead><tbody>`)
for _, gpu := range latest.GPUs { for _, gpu := range latest.GPUs {
derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1) // finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
// falling back to single-card applied limit if the ramp hasn't run.
finalLimitW := gpu.StablePowerLimitW
if finalLimitW <= 0 {
finalLimitW = gpu.AppliedPowerLimitW
}
// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
derated := gpu.Derated ||
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
rowStyle := "" rowStyle := ""
achievedStyle := "" finalStyle := ""
if derated { if derated {
rowStyle = ` style="background:rgba(255,180,0,0.08)"` rowStyle = ` style="background:rgba(255,180,0,0.08)"`
achievedStyle = ` style="color:#e6a000;font-weight:600"` finalStyle = ` style="color:#e6a000;font-weight:600"`
} }
statusLabel := gpu.Status statusLabel := gpu.Status
if statusLabel == "" { if statusLabel == "" {
statusLabel = "OK" statusLabel = "OK"
} }
statusColor := "var(--ok)" statusColor := "var(--ok)"
if statusLabel != "OK" { if statusLabel == "FAILED" {
statusColor = "var(--crit-fg,#9f3a38)"
} else if statusLabel != "OK" {
statusColor = "var(--warn)" statusColor = "var(--warn)"
} }
nominalStr := "-" nominalStr := "-"
if gpu.DefaultPowerLimitW > 0 { if gpu.DefaultPowerLimitW > 0 {
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW) nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
} }
achievedStr := "-" singleStr := "-"
if gpu.AppliedPowerLimitW > 0 { if gpu.AppliedPowerLimitW > 0 {
achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW) singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
}
multiStr := "-"
if gpu.StablePowerLimitW > 0 {
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
} }
p95Str := "-" p95Str := "-"
if gpu.MaxObservedPowerW > 0 { if gpu.MaxObservedPowerW > 0 {
@@ -2483,7 +2523,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`) b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`) b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
b.WriteString(`<td>` + nominalStr + `</td>`) b.WriteString(`<td>` + nominalStr + `</td>`)
b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`) b.WriteString(`<td>` + singleStr + `</td>`)
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
b.WriteString(`<td>` + p95Str + `</td>`) b.WriteString(`<td>` + p95Str + `</td>`)
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`) b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
b.WriteString(`</tr>`) b.WriteString(`</tr>`)
@@ -2517,7 +2558,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
func renderBurn() string { func renderBurn() string {
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div> return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div> <div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p> <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px"> <div class="card" style="margin-bottom:16px">

View File

@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
} }
} }
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA Interconnect (NCCL)`,
`Runs in Validate and Stress.`,
`NVIDIA Bandwidth (NVBandwidth)`,
`Intended to stay short enough for Validate.`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)
}
}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) { func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{}) handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()

View File

@@ -736,15 +736,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
} }
dur := t.params.Duration archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
case "nvidia-stress": case "nvidia-stress":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")

View File

@@ -1,117 +0,0 @@
#!/bin/sh
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
set -e
echo "=== generating bee wallpaper ==="
mkdir -p /usr/share/bee
python3 - <<'PYEOF'
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import os
W, H = 1920, 1080
ASCII_ART = [
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
]
SUBTITLE = " Hardware Audit LiveCD"
FG = (0xF6, 0xD0, 0x47)
FG_DIM = (0xD4, 0xA9, 0x1C)
SHADOW = (0x5E, 0x47, 0x05)
SUB = (0x96, 0x7A, 0x17)
BG = (0x05, 0x05, 0x05)
MONO_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
]
SUB_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
]
def load_font(candidates, size):
for path in candidates:
if os.path.exists(path):
return ImageFont.truetype(path, size)
return ImageFont.load_default()
def mono_metrics(font):
probe = Image.new('L', (W, H), 0)
draw = ImageDraw.Draw(probe)
char_w = int(round(draw.textlength("M", font=font)))
bb = draw.textbbox((0, 0), "Mg", font=font)
char_h = bb[3] - bb[1]
return char_w, char_h
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
width = max(len(line) for line in lines) * char_w
height = len(lines) * char_h + line_gap * (len(lines) - 1)
mask = Image.new('L', (width, height), 0)
draw = ImageDraw.Draw(mask)
for row, line in enumerate(lines):
y = row * (char_h + line_gap)
for col, ch in enumerate(line):
if ch == ' ':
continue
x = col * char_w
draw.text((x, y), ch, font=font, fill=255)
return mask
img = Image.new('RGB', (W, H), BG)
draw = ImageDraw.Draw(img)
# Soft amber glow under the logo without depending on font rendering.
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
glow_draw = ImageDraw.Draw(glow)
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
glow = glow.filter(ImageFilter.GaussianBlur(60))
img = Image.alpha_composite(img.convert('RGBA'), glow)
TARGET_LOGO_W = 400
max_chars = max(len(line) for line in ASCII_ART)
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
_probe_cw, _ = mono_metrics(_probe_font)
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
char_w, char_h = mono_metrics(font_logo)
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
logo_w, logo_h = logo_mask.size
logo_x = (W - logo_w) // 2
logo_y = 380
sh_off = max(1, font_size_logo // 6)
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
img.paste(FG, (logo_x, logo_y), logo_mask)
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
sub_y = logo_y + logo_h + 48
draw = ImageDraw.Draw(img)
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
img = img.convert('RGB')
img.save('/usr/share/bee/wallpaper.png', optimize=True)
print('wallpaper written: /usr/share/bee/wallpaper.png')
PYEOF
echo "=== wallpaper done ==="