Add real-data duration estimates to validate tab profiles
- Add SATEstimated* constants to sat.go derived from _v8 production logs,
with a rule to recalculate them whenever the script changes
- Extend validateInventory with NvidiaGPUCount to make estimates GPU-aware
- Update all validate card duration strings: CPU, memory, storage, NVIDIA GPU,
targeted stress/power, pulse test, NCCL, nvbandwidth
- Fix nvbandwidth description ("intended to stay short" → actual ~45 min)
- Top-level profile labels show computed total including GPU count
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,6 +20,54 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// Estimated wall-clock durations for each SAT/validate test, derived from real
|
||||
// production logs in _benchmark/_v8/.
|
||||
//
|
||||
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
|
||||
// the corresponding Run*Pack function change, re-measure the wall-clock duration
|
||||
// from actual task logs and update the matching constant here.
|
||||
//
|
||||
// Sources:
|
||||
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
||||
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
||||
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
||||
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
||||
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||
const (
|
||||
// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
|
||||
SATEstimatedCPUValidateSec = 65
|
||||
// CPU stress: stress-ng 1800 s (stress mode default).
|
||||
SATEstimatedCPUStressSec = 1800
|
||||
|
||||
// RAM: memtester 256 MB / 1 pass.
|
||||
SATEstimatedMemoryValidateSec = 70
|
||||
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||
SATEstimatedMemoryStressSec = 140
|
||||
|
||||
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
|
||||
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
|
||||
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
|
||||
SATEstimatedNvidiaGPUStressPerGPUSec = 450
|
||||
|
||||
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
|
||||
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
|
||||
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
|
||||
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
|
||||
|
||||
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||
SATEstimatedNvidiaPulseTestSec = 5000
|
||||
|
||||
// NCCL all_reduce_perf, all GPUs simultaneously.
|
||||
SATEstimatedNvidiaInterconnectSec = 300
|
||||
// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
|
||||
// without a user-configurable time limit; duration is determined by nvbandwidth itself.
|
||||
SATEstimatedNvidiaBandwidthSec = 2700
|
||||
)
|
||||
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
|
||||
Reference in New Issue
Block a user