From 2be7ae6d28243e8571e8f56b90ea40fc75737c38 Mon Sep 17 00:00:00 2001
From: Mikhail Chusavitin <mchusavitin@mchusmbp.local>
Date: Tue, 14 Apr 2026 14:12:06 +0300
Subject: [PATCH] Refine NVIDIA benchmark phase timing

---
 audit/internal/platform/benchmark.go        | 312 +++++++++++++-------
 audit/internal/platform/benchmark_report.go |   4 +-
 audit/internal/platform/benchmark_test.go   | 107 ++++++-
 audit/internal/platform/benchmark_types.go  |   2 +-
 iso/builder/bee-gpu-stress.c                | 146 +++++++--
 iso/overlay/usr/local/bin/bee-gpu-burn      |  12 +-
 6 files changed, 450 insertions(+), 133 deletions(-)

diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go
index bddf377..6ad9754 100644
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -76,7 +76,56 @@ var (
 // benchmarkPrecisionPhases lists the precision categories run as individual
 // steady-state windows before the combined steady pass.  Order is from lowest
 // to highest power draw so thermal ramp-up is gradual.
-var benchmarkPrecisionPhases = []string{"fp8", "fp16", "fp32", "fp64", "fp4"}
+var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
+
+// buildBenchmarkSteadyPlan lays out the steady-state plan for spec: one phase per
+// entry of benchmarkPrecisionPhases at basePhaseSec, then a "mixed" phase at
+// mixedPhaseSec. metricStage maps each plan label to its metric stage name.
+func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
+	switch spec.Name {
+	case NvidiaBenchmarkProfileStandard:
+		basePhaseSec = 60
+		mixedPhaseSec = 300
+	case NvidiaBenchmarkProfileStability:
+		basePhaseSec = 300
+		mixedPhaseSec = 3600
+	case NvidiaBenchmarkProfileOvernight:
+		basePhaseSec = 3600
+		mixedPhaseSec = 14400
+	default:
+		totalWeight := len(benchmarkPrecisionPhases) + 5 // mixed counts as 5 precision slots; always >= 5
+		basePhaseSec = spec.SteadySec / totalWeight
+		if basePhaseSec <= 0 {
+			basePhaseSec = 1 // never plan a zero-length phase
+		}
+		mixedPhaseSec = basePhaseSec * 5
+	}
+	planLabels = make([]string, 0, len(benchmarkPrecisionPhases)+1)
+	planPhases = make([]benchmarkPlannedPhase, 0, len(benchmarkPrecisionPhases)+1)
+	for _, prec := range benchmarkPrecisionPhases {
+		planLabels = append(planLabels, prec)
+		planPhases = append(planPhases, benchmarkPlannedPhase{
+			PlanLabel:   prec,
+			MetricStage: metricStage(prec),
+			DurationSec: basePhaseSec,
+		})
+	}
+	planLabels = append(planLabels, "mixed")
+	planPhases = append(planPhases, benchmarkPlannedPhase{
+		PlanLabel:   "mixed",
+		MetricStage: metricStage("mixed"),
+		DurationSec: mixedPhaseSec,
+	})
+	return planLabels, planPhases, basePhaseSec, mixedPhaseSec
+}
+
+func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string { // comma-joined DurationSec values, in plan order
+	secs := make([]string, len(phases))
+	for i := range phases {
+		secs[i] = strconv.Itoa(phases[i].DurationSec)
+	}
+	return strings.Join(secs, ",")
+}
 
 func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	if ctx == nil {
@@ -233,42 +282,42 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				continue
 			}
 
-			// ── Per-precision stability phases ────────────────────────────────────────
-			// Run each precision category alone so PowerCVPct reflects genuine GPU
-			// power stability, not kernel-mix variance.
-			// Time budget: each phase gets steadySec/numPhases, minimum 60 s.
-			// SteadySec is split equally across all precision phases + 1 combined slot.
-			// Skipped phases (unsupported precision) are simply omitted; combined is fixed.
-			totalSlots := len(benchmarkPrecisionPhases) + 1
-			perPhaseSec := spec.SteadySec / totalSlots
-			if perPhaseSec < 60 {
-				perPhaseSec = 60
-			}
+			// Run synthetic precision phases and the combined steady phase as one
+			// uninterrupted command so the GPU stays hot between windows.
 			eccBase, _ := queryECCCounters(idx)
-			for _, prec := range benchmarkPrecisionPhases {
-				phaseCmd := []string{
-					"bee-gpu-burn",
-					"--seconds", strconv.Itoa(perPhaseSec),
-					"--size-mb", strconv.Itoa(opts.SizeMB),
-					"--devices", strconv.Itoa(idx),
-					"--precision", prec,
+			planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
+				if label == "mixed" {
+					return fmt.Sprintf("gpu-%d-steady", idx)
 				}
-				logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec))
-				phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
-				eccBefore, _ := queryECCCounters(idx)
-				phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, logFunc)
-				appendBenchmarkMetrics(&metricRows, phaseRows, phaseLogName)
-				appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
-				eccAfter, _ := queryECCCounters(idx)
-				if phaseErr != nil || len(phaseRows) == 0 {
+				return fmt.Sprintf("gpu-%d-steady-%s", idx, label)
+			})
+			planCmd := []string{
+				"bee-gpu-burn",
+				"--seconds", strconv.Itoa(basePhaseSec),
+				"--size-mb", strconv.Itoa(opts.SizeMB),
+				"--devices", strconv.Itoa(idx),
+				"--precision-plan", strings.Join(planLabels, ","),
+				"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
+			}
+			logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
+			_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
+			for _, phaseSpec := range planPhases {
+				if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
+					appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage)
+				}
+				appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
+			}
+			for _, prec := range benchmarkPrecisionPhases {
+				stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
+				phaseRows := phaseRowsByStage[stageName]
+				if len(phaseRows) == 0 {
 					continue
 				}
 				phase := BenchmarkPrecisionSteadyPhase{
 					Precision: prec,
 					Steady:    summarizeBenchmarkTelemetry(phaseRows),
-					ECC:       diffECCCounters(eccBefore, eccAfter),
 				}
-				for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles {
+				for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
 					if p.Supported {
 						phase.TeraOpsPerSec += p.TeraOpsPerSec
 						phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
@@ -278,13 +327,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			}
 
 			beforeThrottle, _ := queryThrottleCounters(idx)
-			steadyCmd := []string{
-				"bee-gpu-burn",
-				"--seconds", strconv.Itoa(perPhaseSec),
-				"--size-mb", strconv.Itoa(opts.SizeMB),
-				"--devices", strconv.Itoa(idx),
-			}
-			logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec))
+			logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
 
 			// Sample server power via IPMI in parallel with the steady phase.
 			// We collect readings every 5s and average them.
@@ -320,9 +363,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				}
 			}()
 
-			steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, logFunc)
-			appendBenchmarkMetrics(&metricRows, steadyRows, fmt.Sprintf("gpu-%d-steady", idx))
-			appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-steady", idx), steadyOut)
 			close(ipmiStopCh)
 			if loadedW, ok := <-ipmiResultCh; ok {
 				serverLoadedWSum += loadedW
@@ -331,11 +371,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
 			}
 			afterThrottle, _ := queryThrottleCounters(idx)
-			if steadyErr != nil {
-				gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
+			if planErr != nil {
+				gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
 			}
 
-			parseResult := parseBenchmarkBurnLog(string(steadyOut))
+			steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)]
+			parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"]))
 			gpuResult.ComputeCapability = parseResult.ComputeCapability
 			gpuResult.Backend = parseResult.Backend
 			gpuResult.PrecisionResults = parseResult.Profiles
@@ -349,17 +390,19 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
 			}
 
-			cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
-			if err != nil && err != context.Canceled {
-				gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
+			if spec.CooldownSec > 0 {
+				cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
+				if err != nil && err != context.Canceled {
+					gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
+				}
+				gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
+				appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
 			}
-			gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
-			appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
 
 			gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
 			gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
-			if steadyErr != nil {
-				gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
+			if planErr != nil {
+				gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
 			} else if parseResult.Fallback {
 				gpuResult.Status = "PARTIAL"
 			} else {
@@ -462,11 +505,11 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
 func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability:
-		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}
+		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}
 	case NvidiaBenchmarkProfileOvernight:
-		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}
+		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}
 	default:
-		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}
+		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}
 	}
 }
 
@@ -795,6 +838,66 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
 	return out, metricRows, err
 }
 
+type benchmarkPlannedPhase struct { // one timed window of a multi-phase bee-gpu-burn plan
+	PlanLabel   string // label listed in --precision-plan; also the key into the per-phase log map
+	MetricStage string // stage name under which this phase's metric rows are grouped
+	DurationSec int    // planned phase length in seconds, sent via --precision-plan-seconds
+}
+
+// runBenchmarkPlannedCommandWithMetrics runs cmd through runBenchmarkCommandWithMetrics
+// and returns its raw output and error, plus the metric rows grouped by planned
+// phase (keyed by MetricStage) and the output split into per-phase-label logs.
+func runBenchmarkPlannedCommandWithMetrics(
+	ctx context.Context, verboseLog, name string, cmd, env []string,
+	gpuIndices []int, phases []benchmarkPlannedPhase, logFunc func(string),
+) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) {
+	out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc)
+	rowsByStage := splitBenchmarkRowsByPlannedPhase(rows, phases)
+	logsByLabel := splitBenchmarkLogByPlannedPhase(out)
+	return out, rowsByStage, logsByLabel, err
+}
+
+// splitBenchmarkRowsByPlannedPhase buckets rows under the MetricStage of the phase whose
+// cumulative window contains ElapsedSec; rows past the plan end go to the last phase.
+func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow {
+	byStage := make(map[string][]GPUMetricRow, len(phases))
+	if len(rows) == 0 || len(phases) == 0 {
+		return byStage
+	}
+	for _, row := range rows {
+		stage, boundarySec := phases[len(phases)-1].MetricStage, 0
+		for _, phase := range phases {
+			windowSec := phase.DurationSec
+			if windowSec <= 0 {
+				windowSec = 1 // non-positive durations still occupy a one-second window
+			}
+			if boundarySec += windowSec; row.ElapsedSec < float64(boundarySec) {
+				stage = phase.MetricStage
+				break
+			}
+		}
+		byStage[stage] = append(byStage[stage], row)
+	}
+	return byStage
+}
+
+// splitBenchmarkLogByPlannedPhase groups the raw lines found between phase_begin=<label> and phase_end= markers by label.
+func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte {
+	byLabel, active := make(map[string][]byte), ""
+	for _, line := range strings.Split(strings.ReplaceAll(string(raw), "\r\n", "\n"), "\n") {
+		text := strings.TrimSpace(stripBenchmarkPrefix(line))
+		switch {
+		case strings.HasPrefix(text, "phase_begin="):
+			active = strings.TrimSpace(strings.TrimPrefix(text, "phase_begin="))
+		case strings.HasPrefix(text, "phase_end="):
+			active = "" // lines outside any phase are not collected
+		case active != "":
+			byLabel[active] = append(byLabel[active], line+"\n"...)
+		}
+	}
+	return byLabel
+}
+
 type benchmarkCoolingSample struct {
 	AvgFanRPM             float64
 	AvgFanDutyCyclePct    float64
@@ -968,6 +1071,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
 		category = "fp32_tf32"
 	case strings.HasPrefix(name, "fp16"):
 		category = "fp16_bf16"
+	case strings.HasPrefix(name, "int8"):
+		category = "int8"
 	case strings.HasPrefix(name, "fp8"):
 		category = "fp8"
 	case strings.HasPrefix(name, "fp4"):
@@ -985,6 +1090,7 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
 //	fp64  = 2.0  — double precision, 2× more bits per operand
 //	fp32  = 1.0  — single precision baseline
 //	fp16  = 0.5  — half precision
+//	int8  = 0.25 — quarter precision
 //	fp8   = 0.25 — quarter precision
 //	fp4   = 0.125 — eighth precision
 //
@@ -998,6 +1104,8 @@ func precisionWeight(category string) float64 {
 		return 1.0
 	case "fp16_bf16":
 		return 0.5
+	case "int8":
+		return 0.25
 	case "fp8":
 		return 0.25
 	case "fp4":
@@ -1861,41 +1969,41 @@ func runNvidiaBenchmarkParallel(
 		}
 	}
 
-	// ── Per-precision stability phases (parallel) ─────────────────────────────
-	totalSlots := len(benchmarkPrecisionPhases) + 1
-	perPhaseSec := spec.SteadySec / totalSlots
-	if perPhaseSec < 60 {
-		perPhaseSec = 60
-	}
+	// Run synthetic precision phases and the combined steady phase as one
+	// uninterrupted command so the GPUs stay hot between windows.
 	eccBase := make(map[int]BenchmarkECCCounters, len(selected))
 	for _, idx := range selected {
 		eccBase[idx], _ = queryECCCounters(idx)
 	}
+	planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
+		if label == "mixed" {
+			return "steady"
+		}
+		return "gpu-all-steady-" + label
+	})
+	planCmd := []string{
+		"bee-gpu-burn",
+		"--seconds", strconv.Itoa(basePhaseSec),
+		"--size-mb", strconv.Itoa(opts.SizeMB),
+		"--devices", allDevices,
+		"--precision-plan", strings.Join(planLabels, ","),
+		"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
+	}
+	logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
+	_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
+	for _, phaseSpec := range planPhases {
+		if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
+			appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage)
+		}
+		appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
+	}
 	for _, prec := range benchmarkPrecisionPhases {
-		phaseCmd := []string{
-			"bee-gpu-burn",
-			"--seconds", strconv.Itoa(perPhaseSec),
-			"--size-mb", strconv.Itoa(opts.SizeMB),
-			"--devices", allDevices,
-			"--precision", prec,
-		}
-		logFunc(fmt.Sprintf("GPUs %s: %s stability phase (%ds)", allDevices, prec, perPhaseSec))
 		phaseLogName := "gpu-all-steady-" + prec
-		eccBeforePhase := make(map[int]BenchmarkECCCounters, len(selected))
-		for _, idx := range selected {
-			eccBeforePhase[idx], _ = queryECCCounters(idx)
-		}
-		phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, logFunc)
-		appendBenchmarkMetrics(allMetricRows, phaseRows, phaseLogName)
-		appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
-		eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected))
-		for _, idx := range selected {
-			eccAfterPhase[idx], _ = queryECCCounters(idx)
-		}
-		if phaseErr != nil || len(phaseRows) == 0 {
+		phaseRows := phaseRowsByStage[phaseLogName]
+		if len(phaseRows) == 0 {
 			continue
 		}
-		parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseOut))
+		parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
 		for _, idx := range selected {
 			perGPU := filterRowsByGPU(phaseRows, idx)
 			if len(perGPU) == 0 {
@@ -1904,7 +2012,6 @@ func runNvidiaBenchmarkParallel(
 			phase := BenchmarkPrecisionSteadyPhase{
 				Precision: prec,
 				Steady:    summarizeBenchmarkTelemetry(perGPU),
-				ECC:       diffECCCounters(eccBeforePhase[idx], eccAfterPhase[idx]),
 			}
 			if pr, ok := parseByGPU[idx]; ok {
 				for _, p := range pr.Profiles {
@@ -1924,14 +2031,7 @@ func runNvidiaBenchmarkParallel(
 		beforeThrottle[idx], _ = queryThrottleCounters(idx)
 	}
 
-	// Steady: all GPUs simultaneously (combined). Fixed at one slot = perPhaseSec.
-	steadyCmd := []string{
-		"bee-gpu-burn",
-		"--seconds", strconv.Itoa(perPhaseSec),
-		"--size-mb", strconv.Itoa(opts.SizeMB),
-		"--devices", allDevices,
-	}
-	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, perPhaseSec))
+	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
 
 	// Sample server power via IPMI in parallel with steady phase.
 	ipmiStopCh := make(chan struct{})
@@ -1965,9 +2065,6 @@ func runNvidiaBenchmarkParallel(
 		}
 	}()
 
-	steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, logFunc)
-	appendBenchmarkMetrics(allMetricRows, steadyRows, "steady")
-	appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "steady", steadyOut)
 	close(ipmiStopCh)
 	if loadedW, ok := <-ipmiResultCh; ok {
 		*serverLoadedWSum += loadedW
@@ -1980,7 +2077,8 @@ func runNvidiaBenchmarkParallel(
 		afterThrottle[idx], _ = queryThrottleCounters(idx)
 	}
 
-	parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
+	steadyRows := phaseRowsByStage["steady"]
+	parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"]))
 
 	for _, idx := range selected {
 		perGPU := filterRowsByGPU(steadyRows, idx)
@@ -1998,23 +2096,25 @@ func runNvidiaBenchmarkParallel(
 				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
 			}
 		}
-		if steadyErr != nil {
-			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
+		if planErr != nil {
+			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error())
 		}
 	}
 
 	// Cooldown: all GPUs together.
-	cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
-	if err != nil && err != context.Canceled {
-		for _, idx := range selected {
-			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
+	if spec.CooldownSec > 0 {
+		cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
+		if err != nil && err != context.Canceled {
+			for _, idx := range selected {
+				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
+			}
 		}
+		for _, idx := range selected {
+			perGPU := filterRowsByGPU(cooldownRows, idx)
+			gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
+		}
+		appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
 	}
-	for _, idx := range selected {
-		perGPU := filterRowsByGPU(cooldownRows, idx)
-		gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
-	}
-	appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
 
 	// Score and finalize each GPU.
 	for _, idx := range selected {
@@ -2023,8 +2123,8 @@ func runNvidiaBenchmarkParallel(
 		r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
 		pr := parseResults[idx]
 		switch {
-		case steadyErr != nil:
-			r.Status = classifySATErrorStatus(steadyOut, steadyErr)
+		case planErr != nil:
+			r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
 		case pr.Fallback:
 			r.Status = "PARTIAL"
 		default:
@@ -2213,7 +2313,7 @@ func runBenchmarkPowerCalibration(
 	gpuIndices []int,
 	logFunc func(string),
 ) map[int]float64 {
-	const calibDurationSec = 45
+	const calibDurationSec = 120
 
 	// dcgmi must be present.
 	if _, err := exec.LookPath("dcgmi"); err != nil {
diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go
index 0b66d92..b75fa5d 100644
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -88,10 +88,10 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
 	b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
 	b.WriteString("**Compute score** is derived from two phases:\n\n")
-	b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
+	b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
 	b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
 	b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
-	b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
+	b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
 	b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
 	b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
 	b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go
index 1b2f08a..57219d9 100644
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
 		{
 			name:    "default",
 			profile: "",
-			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
 		},
 		{
 			name:    "stability",
 			profile: "stability",
-			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
 		},
 		{
 			name:    "overnight",
 			profile: "overnight",
-			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
 		},
 	}
 
@@ -41,6 +41,92 @@ func TestResolveBenchmarkProfile(t *testing.T) {
 	}
 }
 
+func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
+	t.Parallel()
+	// Standard profile uses fixed windows (SteadySec is not consulted): six 60s precision phases, then 300s mixed.
+	labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
+		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
+		func(label string) string { return label },
+	)
+	if len(labels) != 7 || len(phases) != 7 {
+		t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
+	}
+	if basePhaseSec != 60 {
+		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
+	}
+	if mixedPhaseSec != 300 {
+		t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
+	}
+	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
+		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
+	}
+	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
+		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
+	}
+}
+
+func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
+	t.Parallel()
+	// Stability profile uses fixed windows: six 300s precision phases, then a 3600s mixed phase.
+	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
+		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
+		func(label string) string { return label },
+	)
+	if basePhaseSec != 300 {
+		t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
+	}
+	if mixedPhaseSec != 3600 {
+		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
+	}
+	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
+		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
+	}
+}
+
+func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
+	t.Parallel()
+	// Overnight profile uses fixed windows: six 3600s precision phases, then a 14400s mixed phase.
+	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
+		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
+		func(label string) string { return label },
+	)
+	if basePhaseSec != 3600 {
+		t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
+	}
+	if mixedPhaseSec != 14400 {
+		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
+	}
+	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
+		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
+	}
+}
+
+func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
+	t.Parallel()
+	// Phase windows are [0,10), [10,20) and [20,70): rows at 25s and 65s must both land in "mixed".
+	phases := []benchmarkPlannedPhase{
+		{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
+		{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
+		{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
+	}
+	rows := []GPUMetricRow{
+		{ElapsedSec: 5},
+		{ElapsedSec: 15},
+		{ElapsedSec: 25},
+		{ElapsedSec: 65},
+	}
+	got := splitBenchmarkRowsByPlannedPhase(rows, phases)
+	if len(got["fp8"]) != 1 {
+		t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
+	}
+	if len(got["fp16"]) != 1 {
+		t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
+	}
+	if len(got["mixed"]) != 2 {
+		t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
+	}
+}
+
 func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	t.Parallel()
 
@@ -65,8 +151,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
 		"[gpu 0] compute_capability=9.0",
 		"[gpu 0] backend=cublasLt",
 		"[gpu 0] duration_s=10",
+		"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
 		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
 		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
+		"[gpu 0] int8_tensor_iterations=80",
 		"[gpu 0] fp16_tensor_iterations=200",
 		"[gpu 0] fp8_e4m3_iterations=50",
 		"[gpu 0] status=OK",
@@ -79,15 +167,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
 	if got.ComputeCapability != "9.0" {
 		t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
 	}
-	if len(got.Profiles) != 2 {
-		t.Fatalf("profiles=%d want 2", len(got.Profiles))
+	if len(got.Profiles) != 3 {
+		t.Fatalf("profiles=%d want 3", len(got.Profiles))
 	}
 	if got.Profiles[0].TeraOpsPerSec <= 0 {
 		t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
 	}
+	if got.Profiles[0].Category != "fp16_bf16" {
+		t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
+	}
 	if got.Profiles[1].Category != "fp8" {
 		t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
 	}
+	if got.Profiles[2].Category != "int8" {
+		t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
+	}
+	if got.Profiles[2].Weight != 0.25 {
+		t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
+	}
 }
 
 func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go
index bb0690b..2c1544b 100644
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -179,7 +179,7 @@ type BenchmarkPrecisionResult struct {
 	Iterations    uint64  `json:"iterations,omitempty"`
 	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
 	// Weight is the fp32-equivalence factor for this precision category.
-	// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
+	// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
 	// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
 	Weight                float64 `json:"weight,omitempty"`
 	WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
diff --git a/iso/builder/bee-gpu-stress.c b/iso/builder/bee-gpu-stress.c
index 65f0674..3bba297 100644
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -642,6 +642,20 @@ static const struct profile_desc k_profiles[] = {
         CUDA_R_16F,
         CUBLAS_COMPUTE_32F_FAST_16F,
     },
+    {
+        "int8_tensor",
+        "int8",
+        75,
+        1,
+        0,
+        0,
+        128,
+        CUDA_R_8I,
+        CUDA_R_8I,
+        CUDA_R_32I,
+        CUDA_R_32I,
+        CUBLAS_COMPUTE_32I,
+    },
     {
         "fp8_e4m3",
         "fp8",
@@ -760,10 +774,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
 static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
     switch (type) {
         case CUDA_R_32F:
+        case CUDA_R_32I:
             return (size_t)(elements * 4u);
         case CUDA_R_16F:
         case CUDA_R_16BF:
             return (size_t)(elements * 2u);
+        case CUDA_R_8I:
         case CUDA_R_8F_E4M3:
         case CUDA_R_8F_E5M2:
             return (size_t)(elements);
@@ -776,6 +792,13 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
     }
 }
 
+/* Scale type for a profile's matmul descriptor: CUDA_R_32I when the profile
+ * computes in CUBLAS_COMPUTE_32I, CUDA_R_32F for every other compute type. */
+static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
+    int is_int32 = desc->compute_type == CUBLAS_COMPUTE_32I;
+    return is_int32 ? CUDA_R_32I : CUDA_R_32F;
+}
+
 static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
     uint64_t row_tiles = (rows + 127u) / 128u;
     uint64_t col_tiles = (cols + 63u) / 64u;
@@ -944,8 +967,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
         return 0;
     }
 
+    cudaDataType_t scale_type = matmul_scale_type(desc);
     if (!check_cublas("cublasLtMatmulDescCreate",
-                      cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
+                      cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
         destroy_profile(cublas, cuda, out);
         return 0;
     }
@@ -1094,17 +1118,25 @@ static int prepare_profile(struct cublaslt_api *cublas,
 static int run_cublas_profile(cublasLtHandle_t handle,
                               struct cublaslt_api *cublas,
                               struct prepared_profile *profile) {
+    int32_t alpha_i32 = 1;
+    int32_t beta_i32 = 0;
     float alpha = 1.0f;
     float beta = 0.0f;
+    const void *alpha_ptr = &alpha;
+    const void *beta_ptr = &beta;
+    if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
+        alpha_ptr = &alpha_i32;
+        beta_ptr = &beta_i32;
+    }
     return check_cublas(profile->desc.name,
                         cublas->cublasLtMatmul(handle,
                                                profile->op_desc,
-                                               &alpha,
+                                               alpha_ptr,
                                                (const void *)(uintptr_t)profile->a_dev,
                                                profile->a_layout,
                                                (const void *)(uintptr_t)profile->b_dev,
                                                profile->b_layout,
-                                               &beta,
+                                               beta_ptr,
                                                (const void *)(uintptr_t)profile->c_dev,
                                                profile->c_layout,
                                                (void *)(uintptr_t)profile->d_dev,
@@ -1359,11 +1391,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
 }
 #endif
 
+/* Print a finished run as key=value lines on stdout; status=OK is written last. */
+static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
+    const int has_details = (report->details[0] != '\0');
+    printf("device=%s\n", report->device);
+    printf("device_index=%d\n", device_index);
+    printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
+    printf("backend=%s\n", report->backend);
+    printf("duration_s=%d\n", seconds);
+    printf("buffer_mb=%d\n", report->buffer_mb);
+    printf("streams=%d\n", report->stream_count);
+    printf("iterations=%lu\n", report->iterations);
+    printf("checksum=%llu\n", (unsigned long long)report->checksum);
+    if (has_details) { printf("%s", report->details); }
+    printf("status=OK\n");
+}
+
 int main(int argc, char **argv) {
     int seconds = 5;
     int size_mb = 64;
     int device_index = 0;
     const char *precision_filter = NULL; /* NULL = all; else block_label to match */
+    const char *precision_plan = NULL;
+    const char *precision_plan_seconds = NULL;
     for (int i = 1; i < argc; i++) {
         if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
             seconds = atoi(argv[++i]);
@@ -1373,9 +1423,13 @@ int main(int argc, char **argv) {
             device_index = atoi(argv[++i]);
         } else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
             precision_filter = argv[++i];
+        } else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
+            precision_plan = argv[++i];
+        } else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
+            precision_plan_seconds = argv[++i];
         } else {
             fprintf(stderr,
-                    "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n",
+                    "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
                     argv[0]);
             return 2;
         }
@@ -1436,6 +1490,76 @@ int main(int argc, char **argv) {
     int ok = 0;
 
 #if HAVE_CUBLASLT_HEADERS
+    if (precision_plan != NULL && precision_plan[0] != '\0') {
+        char *plan_copy = strdup(precision_plan);
+        char *plan_seconds_copy = NULL;
+        int phase_seconds[32] = {0};
+        int phase_seconds_count = 0;
+        int phase_ok = 0;
+        if (plan_copy == NULL) {
+            fprintf(stderr, "failed to allocate precision plan buffer\n");
+            return 1;
+        }
+        if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
+            plan_seconds_copy = strdup(precision_plan_seconds);
+            if (plan_seconds_copy == NULL) {
+                free(plan_copy);
+                fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
+                return 1;
+            }
+            for (char *sec_token = strtok(plan_seconds_copy, ",");
+                 sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
+                 sec_token = strtok(NULL, ",")) {
+                while (*sec_token == ' ' || *sec_token == '\t') {
+                    sec_token++;
+                }
+                if (*sec_token == '\0') {
+                    continue;
+                }
+                phase_seconds[phase_seconds_count++] = atoi(sec_token);
+            }
+        }
+        int phase_idx = 0; /* counts non-blank entries only, like phase_seconds[] */
+        for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ",")) {
+            while (*token == ' ' || *token == '\t') {
+                token++;
+            }
+            if (*token == '\0') {
+                continue; /* blank entry: consumes no duration slot */
+            }
+            const char *phase_name = token;
+            /* "mixed"/"all" drop the precision filter (NULL = all profiles). */
+            int run_all = strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0;
+            const char *phase_filter = run_all ? NULL : token;
+            int phase_duration = seconds;
+            if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
+                phase_duration = phase_seconds[phase_idx];
+            }
+            phase_idx++;
+            printf("phase_begin=%s\n", phase_name);
+            fflush(stdout);
+            memset(&report, 0, sizeof(report));
+            ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
+            if (ok) {
+                print_stress_report(&report, device_index, phase_duration);
+                phase_ok = 1;
+            } else {
+                printf("phase_error=%s\n", phase_name);
+                if (report.details[0] != '\0') {
+                    printf("%s", report.details);
+                    if (report.details[strlen(report.details) - 1] != '\n') {
+                        printf("\n");
+                    }
+                }
+                printf("status=FAILED\n");
+            }
+            printf("phase_end=%s\n", phase_name);
+            fflush(stdout);
+        }
+        free(plan_seconds_copy);
+        free(plan_copy);
+        return phase_ok ? 0 : 1;
+    }
     ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
 #endif
     if (!ok) {
@@ -1454,18 +1578,6 @@ int main(int argc, char **argv) {
         }
     }
 
-    printf("device=%s\n", report.device);
-    printf("device_index=%d\n", device_index);
-    printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
-    printf("backend=%s\n", report.backend);
-    printf("duration_s=%d\n", seconds);
-    printf("buffer_mb=%d\n", report.buffer_mb);
-    printf("streams=%d\n", report.stream_count);
-    printf("iterations=%lu\n", report.iterations);
-    printf("checksum=%llu\n", (unsigned long long)report.checksum);
-    if (report.details[0] != '\0') {
-        printf("%s", report.details);
-    }
-    printf("status=OK\n");
+    print_stress_report(&report, device_index, seconds);
     return 0;
 }
diff --git a/iso/overlay/usr/local/bin/bee-gpu-burn b/iso/overlay/usr/local/bin/bee-gpu-burn
index d736022..7b6018c 100755
--- a/iso/overlay/usr/local/bin/bee-gpu-burn
+++ b/iso/overlay/usr/local/bin/bee-gpu-burn
@@ -7,10 +7,12 @@ SIZE_MB=0
 DEVICES=""
 EXCLUDE=""
 PRECISION=""
+PRECISION_PLAN=""
+PRECISION_PLAN_SECONDS=""
 WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
 
 usage() {
-    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision fp8|fp16|fp32|fp64|fp4]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2
     exit 2
 }
 
@@ -32,6 +34,8 @@ while [ "$#" -gt 0 ]; do
         --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
         --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
         --precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
+        --precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;;
+        --precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;;
         *) usage ;;
     esac
 done
@@ -92,8 +96,12 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
     echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
     precision_arg=""
     [ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
+    precision_plan_arg=""
+    [ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}"
+    precision_plan_seconds_arg=""
+    [ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}"
     CUDA_VISIBLE_DEVICES="${id}" \
-        "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} >"${log}" 2>&1 &
+        "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 &
     pid=$!
     WORKERS="${WORKERS} ${pid}:${id}:${log}"
     if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then