From bac89bb6e5b460f185f5a5fa44a7a197b7e94467 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sat, 18 Apr 2026 10:51:15 +0300 Subject: [PATCH] Add real-data duration estimates to validate tab profiles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add SATEstimated* constants to sat.go derived from _v8 production logs, with a rule to recalculate them whenever the script changes - Extend validateInventory with NvidiaGPUCount to make estimates GPU-aware - Update all validate card duration strings: CPU, memory, storage, NVIDIA GPU, targeted stress/power, pulse test, NCCL, nvbandwidth - Fix nvbandwidth description ("intended to stay short" → actual ~45 min) - Top-level profile labels show computed total including GPU count Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/sat.go | 48 ++++++++++++ audit/internal/webui/pages.go | 111 ++++++++++++++++++++++++---- audit/internal/webui/server_test.go | 4 +- 3 files changed, 145 insertions(+), 18 deletions(-) diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index a47e474..349d5aa 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -20,6 +20,54 @@ import ( "time" ) +// Estimated wall-clock durations for each SAT/validate test, derived from real +// production logs in _benchmark/_v8/. +// +// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside +// the corresponding Run*Pack function change, re-measure the wall-clock duration +// from actual task logs and update the matching constant here. +// +// Sources: +// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s +// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s +// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU +// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU +// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead) +// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU +// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous) +// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous) +// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous) +const ( + // CPU stress: stress-ng 60 s + lscpu/sensors overhead. + SATEstimatedCPUValidateSec = 65 + // CPU stress: stress-ng 1800 s (stress mode default). + SATEstimatedCPUStressSec = 1800 + + // RAM: memtester 256 MB / 1 pass. + SATEstimatedMemoryValidateSec = 70 + // RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size). + SATEstimatedMemoryStressSec = 140 + + // NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential. + SATEstimatedNvidiaGPUValidatePerGPUSec = 85 + // NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential. + SATEstimatedNvidiaGPUStressPerGPUSec = 450 + + // NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential. + SATEstimatedNvidiaTargetedStressPerGPUSec = 350 + // NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential. + SATEstimatedNvidiaTargetedPowerPerGPUSec = 350 + + // NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU). + SATEstimatedNvidiaPulseTestSec = 5000 + + // NCCL all_reduce_perf, all GPUs simultaneously. + SATEstimatedNvidiaInterconnectSec = 300 + // nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests + // without a user-configurable time limit; duration is determined by nvbandwidth itself. + SATEstimatedNvidiaBandwidthSec = 2700 +) + var ( satExecCommand = exec.Command satLookPath = exec.LookPath diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index c7720e0..07da464 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1378,15 +1378,64 @@ setInterval(loadMetricsLayout, 5000); // ── Validate (Acceptance Tests) ─────────────────────────────────────────────── type validateInventory struct { - CPU string - Memory string - Storage string - NVIDIA string - AMD string + CPU string + Memory string + Storage string + NVIDIA string + AMD string + NvidiaGPUCount int + AMDGPUCount int +} + +// validateFmtDur formats a duration in seconds as a human-readable "~N min" or "~N s" string. +func validateFmtDur(secs int) string { + if secs < 120 { + return fmt.Sprintf("~%d s", secs) + } + mins := (secs + 29) / 60 + return fmt.Sprintf("~%d min", mins) +} + +// validateTotalValidateSec returns the estimated wall-clock duration of +// "Validate one by one" in Validate mode for n NVIDIA GPUs. +func validateTotalValidateSec(n int) int { + if n < 0 { + n = 0 + } + total := platform.SATEstimatedCPUValidateSec + + platform.SATEstimatedMemoryValidateSec + + n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec + + platform.SATEstimatedNvidiaInterconnectSec + + platform.SATEstimatedNvidiaBandwidthSec + return total +} + +// validateTotalStressSec returns the estimated wall-clock duration of +// "Validate one by one" in Stress mode for n NVIDIA GPUs. +func validateTotalStressSec(n int) int { + if n < 0 { + n = 0 + } + total := platform.SATEstimatedCPUStressSec + + platform.SATEstimatedMemoryStressSec + + n*platform.SATEstimatedNvidiaGPUStressPerGPUSec + + n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec + + n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec + + platform.SATEstimatedNvidiaPulseTestSec + + platform.SATEstimatedNvidiaInterconnectSec + + platform.SATEstimatedNvidiaBandwidthSec + return total } func renderValidate(opts HandlerOptions) string { inv := loadValidateInventory(opts) + n := inv.NvidiaGPUCount + validateTotalStr := validateFmtDur(validateTotalValidateSec(n)) + stressTotalStr := validateFmtDur(validateTotalStressSec(n)) + gpuNote := "" + if n > 0 { + gpuNote = fmt.Sprintf(" (%d GPU)", n) + } return `
Non-destructive: Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.

Tasks continue in the background — view progress in Tasks.

@@ -1396,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
- +
-

Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).

+

Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.

@@ -1413,19 +1462,19 @@ func renderValidate(opts HandlerOptions) string { inv.CPU, `Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`, `lscpu, sensors, stress-ng`, - `60s in Validate, 30 min in Stress.`, + validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`, )) + renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody( inv.Memory, `Runs a RAM validation pass and records memory state around the test.`, `free, memtester`, - `256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`, + validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`, )) + renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody( inv.Storage, `Scans all storage devices and runs the matching health or self-test path for each device type.`, `lsblk; NVMe: nvme; SATA/SAS: smartctl`, - `Short self-test in Validate, extended self-test in Stress.`, + `Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`, )) + `
@@ -1450,14 +1499,33 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Runs NVIDIA diagnostics and board inventory checks.`, `nvidia-smi, dmidecode, dcgmi diag`, - `Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`, + func() string { + perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec + perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec + if n > 0 { + return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).", + validateFmtDur(perV), n, validateFmtDur(perV*n), + validateFmtDur(perS), n, validateFmtDur(perS*n)) + } + return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).", + validateFmtDur(perV), validateFmtDur(perS)) + }(), )) + `
` + renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( inv.NVIDIA, `Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`, `dcgmi diag targeted_stress`, - `Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.

Only runs in Stress mode. Switch mode above to enable in Run All.

`, + func() string { + per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec + s := "Skipped in Validate. " + if n > 0 { + s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) + } else { + s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) + } + return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` + }(), )) + `
` + `
` + @@ -1465,7 +1533,16 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`, `dcgmi diag targeted_power`, - `Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.

Only runs in Stress mode. Switch mode above to enable in Run All.

`, + func() string { + per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec + s := "Skipped in Validate. " + if n > 0 { + s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) + } else { + s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) + } + return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` + }(), )) + `
` + `
` + @@ -1473,7 +1550,7 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`, `dcgmi diag pulse_test`, - `Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.

Only runs in Stress mode. Switch mode above to enable in Run All.

`, + `Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`

Only runs in Stress mode. Switch mode above to enable in Run All.

`, )) + `
` + `
` + @@ -1481,7 +1558,7 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, `all_reduce_perf (NCCL tests)`, - `Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`, + `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`, )) + `
` + `
` + @@ -1489,7 +1566,7 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, `nvbandwidth`, - `Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`, + `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`, )) + `
` + `
@@ -1922,6 +1999,8 @@ func loadValidateInventory(opts HandlerOptions) validateInventory { out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device") out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU") out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU") + out.NvidiaGPUCount = nvidiaTotal + out.AMDGPUCount = amdTotal return out } diff --git a/audit/internal/webui/server_test.go b/audit/internal/webui/server_test.go index 763d92f..240f43a 100644 --- a/audit/internal/webui/server_test.go +++ b/audit/internal/webui/server_test.go @@ -754,9 +754,9 @@ func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) { body := rec.Body.String() for _, needle := range []string{ `NVIDIA Interconnect (NCCL)`, - `Runs in Validate and Stress.`, + `Validate and Stress:`, `NVIDIA Bandwidth (NVBandwidth)`, - `Intended to stay short enough for Validate.`, + `nvbandwidth runs all built-in tests without a time limit`, } { if !strings.Contains(body, needle) { t.Fatalf("validate page missing %q: %s", needle, body)