diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index a47e474..349d5aa 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -20,6 +20,54 @@ import ( "time" ) +// Estimated wall-clock durations for each SAT/validate test, derived from real +// production logs in _benchmark/_v8/. +// +// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside +// the corresponding Run*Pack function change, re-measure the wall-clock duration +// from actual task logs and update the matching constant here. +// +// Sources: +// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s +// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s +// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU +// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU +// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead) +// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU +// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous) +// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous) +// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous) +const ( + // CPU stress: stress-ng 60 s + lscpu/sensors overhead. + SATEstimatedCPUValidateSec = 65 + // CPU stress: stress-ng 1800 s (stress mode default). + SATEstimatedCPUStressSec = 1800 + + // RAM: memtester 256 MB / 1 pass. + SATEstimatedMemoryValidateSec = 70 + // RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size). + SATEstimatedMemoryStressSec = 140 + + // NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential. + SATEstimatedNvidiaGPUValidatePerGPUSec = 85 + // NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential. 
+ SATEstimatedNvidiaGPUStressPerGPUSec = 450 + + // NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential. + SATEstimatedNvidiaTargetedStressPerGPUSec = 350 + // NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential. + SATEstimatedNvidiaTargetedPowerPerGPUSec = 350 + + // NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU). + SATEstimatedNvidiaPulseTestSec = 5000 + + // NCCL all_reduce_perf, all GPUs simultaneously. + SATEstimatedNvidiaInterconnectSec = 300 + // nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests + // without a user-configurable time limit; duration is determined by nvbandwidth itself. + SATEstimatedNvidiaBandwidthSec = 2700 +) + var ( satExecCommand = exec.Command satLookPath = exec.LookPath diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index c7720e0..07da464 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1378,15 +1378,64 @@ setInterval(loadMetricsLayout, 5000); // ── Validate (Acceptance Tests) ─────────────────────────────────────────────── type validateInventory struct { - CPU string - Memory string - Storage string - NVIDIA string - AMD string + CPU string + Memory string + Storage string + NVIDIA string + AMD string + NvidiaGPUCount int + AMDGPUCount int +} + +// validateFmtDur formats a duration in seconds as a human-readable "~N min" or "~N s" string. +func validateFmtDur(secs int) string { + if secs < 120 { + return fmt.Sprintf("~%d s", secs) + } + mins := (secs + 29) / 60 + return fmt.Sprintf("~%d min", mins) +} + +// validateTotalValidateSec returns the estimated wall-clock duration of +// "Validate one by one" in Validate mode for n NVIDIA GPUs. 
+func validateTotalValidateSec(n int) int { + if n < 0 { + n = 0 + } + total := platform.SATEstimatedCPUValidateSec + + platform.SATEstimatedMemoryValidateSec + + n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec + + platform.SATEstimatedNvidiaInterconnectSec + + platform.SATEstimatedNvidiaBandwidthSec + return total +} + +// validateTotalStressSec returns the estimated wall-clock duration of +// "Validate one by one" in Stress mode for n NVIDIA GPUs. +func validateTotalStressSec(n int) int { + if n < 0 { + n = 0 + } + total := platform.SATEstimatedCPUStressSec + + platform.SATEstimatedMemoryStressSec + + n*platform.SATEstimatedNvidiaGPUStressPerGPUSec + + n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec + + n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec + + platform.SATEstimatedNvidiaPulseTestSec + + platform.SATEstimatedNvidiaInterconnectSec + + platform.SATEstimatedNvidiaBandwidthSec + return total } func renderValidate(opts HandlerOptions) string { inv := loadValidateInventory(opts) + n := inv.NvidiaGPUCount + validateTotalStr := validateFmtDur(validateTotalValidateSec(n)) + stressTotalStr := validateFmtDur(validateTotalStressSec(n)) + gpuNote := "" + if n > 0 { + gpuNote = fmt.Sprintf(" (%d GPU)", n) + } return `
Tasks continue in the background — view progress in Tasks.
@@ -1396,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
-Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).
+Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.
lscpu, sensors, stress-ng`,
- `60s in Validate, 30 min in Stress.`,
+ validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
)) +
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`,
`free, memtester`,
- `256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
+ validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
`lsblk; NVMe: nvme; SATA/SAS: smartctl`,
- `Short self-test in Validate, extended self-test in Stress.`,
+ `Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
)) +
`nvidia-smi, dmidecode, dcgmi diag`,
- `Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
+ func() string {
+ perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
+ perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
+ if n > 0 {
+ return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
+ validateFmtDur(perV), n, validateFmtDur(perV*n),
+ validateFmtDur(perS), n, validateFmtDur(perS*n))
+ }
+ return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
+ validateFmtDur(perV), validateFmtDur(perS))
+ }(),
)) +
`dcgmi diag targeted_stress`,
- `Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.Only runs in Stress mode. Switch mode above to enable in Run All.
`, + func() string { + per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec + s := "Skipped in Validate. " + if n > 0 { + s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) + } else { + s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) + } + return s + `Only runs in Stress mode. Switch mode above to enable in Run All.
` + }(), )) + `dcgmi diag targeted_power`,
- `Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.Only runs in Stress mode. Switch mode above to enable in Run All.
`, + func() string { + per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec + s := "Skipped in Validate. " + if n > 0 { + s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) + } else { + s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) + } + return s + `Only runs in Stress mode. Switch mode above to enable in Run All.
` + }(), )) + `dcgmi diag pulse_test`,
- `Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.Only runs in Stress mode. Switch mode above to enable in Run All.
`, + `Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`Only runs in Stress mode. Switch mode above to enable in Run All.
`, )) + `all_reduce_perf (NCCL tests)`,
- `Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
+ `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) +
`nvbandwidth`,
- `Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
+ `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
)) +
`