Refactor validate modes, fix benchmark report and IPMI power

- Replace diag level 1-4 dropdown with Validate/Stress radio buttons
- Validate: dcgmi L2, 60s CPU, 256MB/1p memtester, SMART short
- Stress: dcgmi L3 + targeted_stress in Run All, 30min CPU, 1GB/3p memtester, SMART long/NVMe extended
- Parallel GPU mode: spawn single task for all GPUs instead of splitting per model
- Benchmark table: per-GPU columns for sequential runs, server-wide column for parallel
- Benchmark report converted to Markdown with server model, GPU model, version in header; only steady-state charts
- Fix IPMI power parsing in benchmark (was looking for 'Current Power', correct field is 'Instantaneous power reading')

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-08 00:42:12 +03:00
parent dd26e03b2d
commit b2f8626fee
12 changed files with 332 additions and 164 deletions

View File

@@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
archive, err = application.RunNvidiaAcceptancePack("", logLine)
}
case "memory":
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
case "storage":
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
case "cpu":
dur := *duration
if dur <= 0 {