Add real-data duration estimates to validate tab profiles

- Add SATEstimated* constants to sat.go derived from _v8 production logs,
  with a rule to recalculate them whenever the script changes
- Extend validateInventory with NvidiaGPUCount to make estimates GPU-aware
- Update all validate card duration strings: CPU, memory, storage, NVIDIA GPU,
  targeted stress/power, pulse test, NCCL, nvbandwidth
- Fix nvbandwidth description ("intended to stay short" → actual ~45 min)
- Top-level profile labels show computed total including GPU count

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 10:51:15 +03:00
parent 7a618da1f9
commit bac89bb6e5
3 changed files with 145 additions and 18 deletions

View File

@@ -20,6 +20,54 @@ import (
"time"
)
// Estimated wall-clock durations for each SAT/validate test, derived from real
// production logs in _benchmark/_v8/.
//
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
// the corresponding Run*Pack function change, re-measure the wall-clock duration
// from actual task logs and update the matching constant here.
//
// Sources:
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 7787 s/GPU
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444448 s/GPU
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347348 s/GPU (300 s default + overhead)
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346351 s/GPU
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210384 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 6642 688 s / 8 GPU (all simultaneous)
const (
// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
SATEstimatedCPUValidateSec = 65
// CPU stress: stress-ng 1800 s (stress mode default).
SATEstimatedCPUStressSec = 1800
// RAM: memtester 256 MB / 1 pass.
SATEstimatedMemoryValidateSec = 70
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
SATEstimatedMemoryStressSec = 140
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
SATEstimatedNvidiaGPUStressPerGPUSec = 450
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
SATEstimatedNvidiaPulseTestSec = 5000
// NCCL all_reduce_perf, all GPUs simultaneously.
SATEstimatedNvidiaInterconnectSec = 300
// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
// without a user-configurable time limit; duration is determined by nvbandwidth itself.
SATEstimatedNvidiaBandwidthSec = 2700
)
var (
satExecCommand = exec.Command
satLookPath = exec.LookPath