Compare commits

11 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 8fe20ba678 | |
| | d973231f37 | |
| | f5d175f488 | |
| | fa00667750 | |
| | c7d2816a7f | |
| | d2eadedff2 | |
| | a98c4d7461 | |
| | 2354ae367d | |
| | 0d0e1f55a7 | |
| | 35f4c53887 | |
| | 981315e6fd | |
@@ -27,14 +27,17 @@ type benchmarkProfileSpec struct {
 }
 
 type benchmarkGPUInfo struct {
-	Index               int
-	UUID                string
-	Name                string
-	BusID               string
-	VBIOS               string
-	PowerLimitW         float64
-	MaxGraphicsClockMHz float64
-	MaxMemoryClockMHz   float64
+	Index                int
+	UUID                 string
+	Name                 string
+	BusID                string
+	VBIOS                string
+	PowerLimitW          float64
+	DefaultPowerLimitW   float64
+	MaxGraphicsClockMHz  float64
+	MaxMemoryClockMHz    float64
+	BaseGraphicsClockMHz float64
+	MultiprocessorCount  int
 }
 
 type benchmarkBurnProfile struct {
@@ -111,6 +114,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 
 	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
 
+	// Server power characterization state — populated during per-GPU phases.
+	var serverIdleW, serverLoadedWSum float64
+	var serverIdleOK, serverLoadedOK bool
+	var serverLoadedSamples int
+
 	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
 	if infoErr != nil {
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
@@ -146,7 +154,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		gpuResult.BusID = info.BusID
 		gpuResult.VBIOS = info.VBIOS
 		gpuResult.PowerLimitW = info.PowerLimitW
+		gpuResult.MultiprocessorCount = info.MultiprocessorCount
+		gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
 		gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
+		gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 		gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 	}
 	if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
@@ -161,6 +172,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
 		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
 
+		// Sample server idle power once (first GPU only — server state is global).
+		if !serverIdleOK {
+			if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
+				serverIdleW = w
+				serverIdleOK = true
+				logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+			}
+		}
+
 		warmupCmd := []string{
 			"bee-gpu-burn",
 			"--seconds", strconv.Itoa(spec.WarmupSec),
@@ -184,7 +204,50 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			"--devices", strconv.Itoa(idx),
 		}
 		logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
+
+		// Sample server power via IPMI in parallel with the steady phase.
+		// We collect readings every 5s and average them.
+		ipmiStopCh := make(chan struct{})
+		ipmiResultCh := make(chan float64, 1)
+		go func() {
+			defer close(ipmiResultCh)
+			var samples []float64
+			ticker := time.NewTicker(5 * time.Second)
+			defer ticker.Stop()
+			// First sample after a short warmup delay.
+			select {
+			case <-ipmiStopCh:
+				return
+			case <-time.After(15 * time.Second):
+			}
+			for {
+				if w, err := queryIPMIServerPowerW(); err == nil {
+					samples = append(samples, w)
+				}
+				select {
+				case <-ipmiStopCh:
+					if len(samples) > 0 {
+						var sum float64
+						for _, w := range samples {
+							sum += w
+						}
+						ipmiResultCh <- sum / float64(len(samples))
+					}
+					return
+				case <-ticker.C:
+				}
+			}
+		}()
+
 		steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
+		close(ipmiStopCh)
+		if loadedW, ok := <-ipmiResultCh; ok {
+			serverLoadedWSum += loadedW
+			serverLoadedSamples++
+			serverLoadedOK = true
+			logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
+		}
+
 		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
 		afterThrottle, _ := queryThrottleCounters(idx)
 		if steadyErr != nil {
@@ -232,6 +295,17 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}
 
+	// Compute server power characterization from accumulated IPMI samples.
+	var gpuReportedSumW float64
+	for _, gpu := range result.GPUs {
+		gpuReportedSumW += gpu.Steady.AvgPowerW
+	}
+	var serverLoadedW float64
+	if serverLoadedSamples > 0 {
+		serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
+	}
+	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
+
 	result.Findings = buildBenchmarkFindings(result)
 	result.OverallStatus = benchmarkOverallStatus(result)
 
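Note: to sanity-check the characterization math this hunk wires up, here is a minimal sketch using the characterizeServerPower helper added further down. The wattage figures are invented for illustration.

```go
// Hypothetical figures, for illustration only.
sp := characterizeServerPower(
	350,  // idleW: IPMI draw sampled before the first GPU phase
	1050, // loadedW: mean of the per-GPU steady-phase IPMI averages
	900,  // gpuReportedSumW: sum of GPU-reported steady AvgPowerW
	true, // ipmiAvailable: both idle and loaded samples were collected
)
// sp.DeltaW == 700; sp.ReportingRatio == 700.0/900 ≈ 0.78,
// just above the 0.75 threshold that triggers the reliability finding.
```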
@@ -243,7 +317,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		return "", fmt.Errorf("write result.json: %w", err)
 	}
 
-	report := renderBenchmarkReport(result)
+	report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
 	if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
 		return "", fmt.Errorf("write report.txt: %w", err)
 	}
@@ -290,7 +364,7 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 
 func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
 	args := []string{
-		"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
+		"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
 		"--format=csv,noheader,nounits",
 	}
 	if len(gpuIndices) > 0 {
@@ -311,14 +385,14 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
 
 	infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
 	for _, row := range rows {
-		if len(row) < 8 {
+		if len(row) < 9 {
 			continue
 		}
 		idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
 		if err != nil {
 			continue
 		}
-		infoByIndex[idx] = benchmarkGPUInfo{
+		info := benchmarkGPUInfo{
 			Index: idx,
 			UUID:  strings.TrimSpace(row[1]),
 			Name:  strings.TrimSpace(row[2]),
@@ -328,6 +402,16 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
 			MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
 			MaxMemoryClockMHz:   parseBenchmarkFloat(row[7]),
 		}
+		if len(row) >= 9 {
+			info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
+		}
+		if len(row) >= 10 {
+			info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
+		}
+		if len(row) >= 11 {
+			info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
+		}
+		infoByIndex[idx] = info
 	}
 	return infoByIndex, nil
 }
@@ -551,6 +635,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
 	}
 	category := "other"
 	switch {
+	case strings.HasPrefix(name, "fp64"):
+		category = "fp64"
 	case strings.HasPrefix(name, "fp32"):
 		category = "fp32_tf32"
 	case strings.HasPrefix(name, "fp16"):
@@ -619,14 +705,23 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 			score.ComputeScore += precision.TeraOpsPerSec
 		}
 	}
-	if gpu.PowerLimitW > 0 {
-		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100)
+	// Use default power limit for sustain score so a manually reduced limit
+	// does not inflate the score. Fall back to enforced limit if default unknown.
+	referencePowerW := gpu.DefaultPowerLimitW
+	if referencePowerW <= 0 {
+		referencePowerW = gpu.PowerLimitW
+	}
+	if referencePowerW > 0 {
+		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100)
 	}
 	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
 	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
 	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
 	score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
 	score.CompositeScore = compositeBenchmarkScore(score)
+	if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
+		score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
+	}
 	return score
 }
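Note: a worked example of why the reference-limit change matters (figures invented). On a GPU whose default limit is 700 W but which an operator capped at 400 W, a 390 W steady draw used to score near-perfect; scored against the default limit, the cap becomes visible.

```go
gpu := BenchmarkGPUResult{PowerLimitW: 400, DefaultPowerLimitW: 700}
gpu.Steady.AvgPowerW = 390 // hypothetical steady-state draw
// Before: math.Min(100, 390/400*100) = 97.5, so the reduced cap inflates the score.
// After:  math.Min(100, 390/700*100) ≈ 55.7, so the cap shows up in the scorecard.
```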
@@ -679,7 +774,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
 		"-g", strconv.Itoa(len(gpuIndices)),
 		"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
 	}
-	env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
+	env := []string{
+		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
+		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
+	}
 	logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
 	out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
 	_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
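Note: CUDA's default enumeration order is FASTEST_FIRST, which can disagree with the PCI-bus ordering that nvidia-smi (and the index selection elsewhere in this run) uses. Pinning CUDA_DEVICE_ORDER=PCI_BUS_ID makes the indices in CUDA_VISIBLE_DEVICES unambiguous on hosts with mixed GPU models.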
@@ -795,10 +893,30 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
 
 func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 	var findings []string
 
+	passed := 0
+	for _, gpu := range result.GPUs {
+		if gpu.Status == "OK" {
+			passed++
+		}
+	}
+	total := len(result.GPUs)
+	if total > 0 {
+		if passed == total {
+			findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
+		} else {
+			findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
+		}
+	}
+
 	if result.Normalization.Status != "full" {
 		findings = append(findings, "Environment normalization was partial; compare results with caution.")
 	}
 	for _, gpu := range result.GPUs {
+		if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
+			findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
+			continue
+		}
 		if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
 			findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
 			continue
@@ -822,10 +940,24 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 		if gpu.Backend == "driver-ptx" {
 			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
 		}
+		if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
+			findings = append(findings, fmt.Sprintf(
+				"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
+				gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
+			))
+		}
 	}
 	if result.Interconnect != nil && result.Interconnect.Supported {
 		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
 	}
+	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
+		if sp.ReportingRatio < 0.75 {
+			findings = append(findings, fmt.Sprintf(
+				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
+				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
+			))
+		}
+	}
 	return dedupeStrings(findings)
 }
@@ -1004,3 +1136,76 @@ func maxInt(a, b int) int {
 	}
 	return b
 }
+
+// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
+// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
+func queryIPMIServerPowerW() (float64, error) {
+	out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
+	if err != nil {
+		return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
+	}
+	for _, line := range strings.Split(string(out), "\n") {
+		if strings.Contains(line, "Current Power") {
+			parts := strings.SplitN(line, ":", 2)
+			if len(parts) == 2 {
+				val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
+				val = strings.TrimSpace(val)
+				w, err := strconv.ParseFloat(val, 64)
+				if err == nil && w > 0 {
+					return w, nil
+				}
+			}
+		}
+	}
+	return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
+}
+
+// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for
+// durationSec seconds. Returns the mean of all successful samples.
+// Returns 0, false if IPMI is unavailable.
+func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
+	if durationSec <= 0 {
+		return 0, false
+	}
+	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
+	var samples []float64
+	for {
+		if w, err := queryIPMIServerPowerW(); err == nil {
+			samples = append(samples, w)
+		}
+		if time.Now().After(deadline) {
+			break
+		}
+		select {
+		case <-ctx.Done():
+			break
+		case <-time.After(2 * time.Second):
+		}
+	}
+	if len(samples) == 0 {
+		return 0, false
+	}
+	var sum float64
+	for _, w := range samples {
+		sum += w
+	}
+	return sum / float64(len(samples)), true
+}
+
+// characterizeServerPower computes BenchmarkServerPower from idle and loaded
+// IPMI samples plus the GPU-reported average power during steady state.
+func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
+	sp := &BenchmarkServerPower{Available: ipmiAvailable}
+	if !ipmiAvailable {
+		sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
+		return sp
+	}
+	sp.IdleW = idleW
+	sp.LoadedW = loadedW
+	sp.DeltaW = loadedW - idleW
+	sp.GPUReportedSumW = gpuReportedSumW
+	if gpuReportedSumW > 0 && sp.DeltaW > 0 {
+		sp.ReportingRatio = sp.DeltaW / gpuReportedSumW
+	}
+	return sp
+}
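Note: a self-contained sketch of the parsing step in isolation. The sample line is a hypothetical BMC format; the "Current Power" label is an assumption worth validating against the target fleet, since many firmwares print "Instantaneous power reading" instead. Separately, the break in the ctx.Done() case of sampleIPMIPowerSeries exits only the select, not the sampling loop, so cancellation effectively falls through to the deadline.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	// Hypothetical BMC output line; format assumed for illustration.
	line := "    Current Power                        : 220 Watts"
	parts := strings.SplitN(line, ":", 2)
	val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
	w, err := strconv.ParseFloat(strings.TrimSpace(val), 64)
	fmt.Println(w, err) // 220 <nil>
}
```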
@@ -2,11 +2,25 @@ package platform
 
 import (
 	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
 	"strings"
 	"time"
 )
 
 func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
+	return renderBenchmarkReportWithCharts(result, nil)
+}
+
+type benchmarkReportChart struct {
+	Title   string
+	Content string
+}
+
+var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
+
+func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
 	var b strings.Builder
 	fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
 	fmt.Fprintf(&b, "===========================\n\n")
@@ -42,6 +56,9 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
 		fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
 		fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
 		fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
+		if gpu.Scores.TOPSPerSMPerGHz > 0 {
+			fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
+		}
 		fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
 		fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
 		fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
@@ -63,13 +80,7 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
 			}
 		}
 	}
-	fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
-		gpu.Throttle.SWPowerCapUS,
-		gpu.Throttle.SWThermalSlowdownUS,
-		gpu.Throttle.SyncBoostUS,
-		gpu.Throttle.HWThermalSlowdownUS,
-		gpu.Throttle.HWPowerBrakeSlowdownUS,
-	)
+	fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
 	if len(gpu.Notes) > 0 {
 		fmt.Fprintf(&b, " Notes:\n")
 		for _, note := range gpu.Notes {
@@ -93,6 +104,40 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")
 	}
 
+	if len(charts) > 0 {
+		fmt.Fprintf(&b, "Terminal Charts\n")
+		fmt.Fprintf(&b, "---------------\n")
+		for _, chart := range charts {
+			content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
+			if content == "" {
+				continue
+			}
+			fmt.Fprintf(&b, "%s\n", chart.Title)
+			fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
+			fmt.Fprintf(&b, "%s\n\n", content)
+		}
+	}
+
+	if sp := result.ServerPower; sp != nil {
+		fmt.Fprintf(&b, "Server Power (IPMI)\n")
+		fmt.Fprintf(&b, "-------------------\n")
+		if !sp.Available {
+			fmt.Fprintf(&b, "Unavailable\n")
+		} else {
+			fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
+			fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
+			fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
+			fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
+			if sp.ReportingRatio > 0 {
+				fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
+			}
+		}
+		for _, note := range sp.Notes {
+			fmt.Fprintf(&b, " Note: %s\n", note)
+		}
+		b.WriteString("\n")
+	}
+
 	fmt.Fprintf(&b, "Methodology\n")
 	fmt.Fprintf(&b, "-----------\n")
 	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
@@ -117,6 +162,72 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
 	return b.String()
 }
 
+func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
+	phases := []struct {
+		name  string
+		label string
+	}{
+		{name: "baseline", label: "Baseline"},
+		{name: "steady", label: "Steady State"},
+		{name: "cooldown", label: "Cooldown"},
+	}
+	var charts []benchmarkReportChart
+	for _, idx := range gpuIndices {
+		for _, phase := range phases {
+			path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name))
+			raw, err := os.ReadFile(path)
+			if err != nil || len(raw) == 0 {
+				continue
+			}
+			charts = append(charts, benchmarkReportChart{
+				Title:   fmt.Sprintf("GPU %d %s", idx, phase.label),
+				Content: string(raw),
+			})
+		}
+	}
+	return charts
+}
+
+func stripANSIEscapeSequences(raw string) string {
+	return ansiEscapePattern.ReplaceAllString(raw, "")
+}
+
+// formatThrottleLine renders throttle counters as human-readable percentages of
+// the steady-state window. Only non-zero counters are shown. When the steady
+// duration is unknown (0), raw seconds are shown instead.
+func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
+	type counter struct {
+		label string
+		us    uint64
+	}
+	counters := []counter{
+		{"sw_power", t.SWPowerCapUS},
+		{"sw_thermal", t.SWThermalSlowdownUS},
+		{"sync_boost", t.SyncBoostUS},
+		{"hw_thermal", t.HWThermalSlowdownUS},
+		{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
+	}
+	var parts []string
+	for _, c := range counters {
+		if c.us == 0 {
+			continue
+		}
+		sec := float64(c.us) / 1e6
+		if steadyDurationSec > 0 {
+			pct := sec / steadyDurationSec * 100
+			parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
+		} else if sec < 1 {
+			parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
+		} else {
+			parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
+		}
+	}
+	if len(parts) == 0 {
+		return "none"
+	}
+	return strings.Join(parts, " ")
+}
+
 func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
 	var b strings.Builder
 	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
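Note: for intuition, the new formatter renders a hypothetical 60 s of sw_power capping as a share of the steady window, falling back to raw seconds when the window is unknown:

```go
t := BenchmarkThrottleCounters{SWPowerCapUS: 60_000_000} // 60 s, invented value
fmt.Println(formatThrottleLine(t, 600)) // "sw_power=10.0% (60s)"
fmt.Println(formatThrottleLine(t, 0))   // "sw_power=60.0s"
```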
@@ -145,3 +145,35 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
 		}
 	}
 }
+
+func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
+	t.Parallel()
+
+	report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
+		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
+		OverallStatus:      "OK",
+		SelectedGPUIndices: []int{0},
+		Normalization: BenchmarkNormalization{
+			Status: "full",
+		},
+	}, []benchmarkReportChart{
+		{
+			Title:   "GPU 0 Steady State",
+			Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
+		},
+	})
+
+	for _, needle := range []string{
+		"Terminal Charts",
+		"GPU 0 Steady State",
+		"GPU 0 chart",
+		"42┤───",
+	} {
+		if !strings.Contains(report, needle) {
+			t.Fatalf("report missing %q\n%s", needle, report)
+		}
+	}
+	if strings.Contains(report, "\x1b[31m") {
+		t.Fatalf("report should not contain ANSI escapes\n%s", report)
+	}
+}
@@ -28,6 +28,7 @@ type NvidiaBenchmarkResult struct {
 	Normalization BenchmarkNormalization       `json:"normalization"`
 	GPUs          []BenchmarkGPUResult         `json:"gpus"`
 	Interconnect  *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	ServerPower   *BenchmarkServerPower        `json:"server_power,omitempty"`
 }
 
 type BenchmarkNormalization struct {
@@ -56,7 +57,10 @@ type BenchmarkGPUResult struct {
 	Backend                string  `json:"backend,omitempty"`
 	Status                 string  `json:"status"`
 	PowerLimitW            float64 `json:"power_limit_w,omitempty"`
+	MultiprocessorCount    int     `json:"multiprocessor_count,omitempty"`
+	DefaultPowerLimitW     float64 `json:"default_power_limit_w,omitempty"`
 	MaxGraphicsClockMHz    float64 `json:"max_graphics_clock_mhz,omitempty"`
+	BaseGraphicsClockMHz   float64 `json:"base_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz      float64 `json:"max_memory_clock_mhz,omitempty"`
 	LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
 	LockedMemoryClockMHz   float64 `json:"locked_memory_clock_mhz,omitempty"`
@@ -117,6 +121,24 @@ type BenchmarkScorecard struct {
 	StabilityScore    float64 `json:"stability_score"`
 	InterconnectScore float64 `json:"interconnect_score"`
 	CompositeScore    float64 `json:"composite_score"`
+	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
+	// Comparable across throttle levels and GPU generations. Low value at normal
+	// clocks indicates silicon degradation.
+	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }
 
+// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
+// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
+// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
+// over-reporting its power consumption.
+type BenchmarkServerPower struct {
+	Available       bool     `json:"available"`
+	IdleW           float64  `json:"idle_w,omitempty"`
+	LoadedW         float64  `json:"loaded_w,omitempty"`
+	DeltaW          float64  `json:"delta_w,omitempty"`
+	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
+	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
+	Notes           []string `json:"notes,omitempty"`
+}
+
 type BenchmarkInterconnectResult struct {
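Note: with the invented figures from the earlier sketch, the new block serializes into result.json roughly as follows (omitempty drops zero-valued fields):

```go
sp := &BenchmarkServerPower{Available: true, IdleW: 350, LoadedW: 1050,
	DeltaW: 700, GPUReportedSumW: 900, ReportingRatio: 0.78}
out, _ := json.Marshal(sp) // import "encoding/json"
fmt.Println(string(out))
// {"available":true,"idle_w":350,"loaded_w":1050,"delta_w":700,"gpu_reported_sum_w":900,"reporting_ratio":0.78}
```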
@@ -630,7 +630,10 @@ func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
 	if len(gpuIndices) == 0 {
 		return nil
 	}
-	return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
+	return []string{
+		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
+		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
+	}
 }
 
 func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
@@ -671,6 +674,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
 			return "", writeErr
 		}
+		if ctx.Err() != nil {
+			return "", ctx.Err()
+		}
 		status, rc := classifySATResult(job.name, out, err)
 		stats.Add(status)
 		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
@@ -1,12 +1,14 @@
 package platform
 
 import (
+	"context"
+	"errors"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 )
 
 func TestStorageSATCommands(t *testing.T) {
@@ -253,11 +255,14 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
 
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
-	if len(env) != 1 {
-		t.Fatalf("env len=%d want 1 (%v)", len(env), env)
+	if len(env) != 2 {
+		t.Fatalf("env len=%d want 2 (%v)", len(env), env)
 	}
-	if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
-		t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
+	if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
+		t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
+	}
+	if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
+		t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
 	}
 }
@@ -350,6 +355,38 @@ func TestClassifySATResult(t *testing.T) {
 	}
 }
 
+func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
+	dir := t.TempDir()
+	ctx, cancel := context.WithCancel(context.Background())
+	t.Cleanup(cancel)
+
+	done := make(chan struct{})
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		cancel()
+		close(done)
+	}()
+
+	archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
+		{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
+	}, nil)
+	<-done
+
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("err=%v want context.Canceled", err)
+	}
+	if archive != "" {
+		t.Fatalf("archive=%q want empty", archive)
+	}
+	matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
+	if globErr != nil {
+		t.Fatalf("Glob error: %v", globErr)
+	}
+	if len(matches) != 0 {
+		t.Fatalf("archives=%v want none", matches)
+	}
+}
+
 func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
 	t.Parallel()
@@ -11,6 +11,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"regexp"
+	"sort"
 	"strings"
 	"sync/atomic"
 	"syscall"
@@ -21,13 +22,238 @@ import (
 )
 
 var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
+
+var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
+	if a == nil {
+		return nil, fmt.Errorf("app not configured")
+	}
+	return a.ListNvidiaGPUs()
+}
 
 // ── Job ID counter ────────────────────────────────────────────────────────────
 
 var jobCounter atomic.Uint64
 
-func newJobID(prefix string) string {
-	return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
+func newJobID(_ string) string {
+	start := int((jobCounter.Add(1) - 1) % 1000)
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	for offset := 0; offset < 1000; offset++ {
+		n := (start + offset) % 1000
+		id := fmt.Sprintf("TASK-%03d", n)
+		if !taskIDInUseLocked(id) {
+			return id
+		}
+	}
+	return fmt.Sprintf("TASK-%03d", start)
+}
+
+func taskIDInUseLocked(id string) bool {
+	for _, t := range globalQueue.tasks {
+		if t != nil && t.ID == id {
+			return true
+		}
+	}
+	return false
+}
+
+type taskRunResponse struct {
+	TaskID    string   `json:"task_id,omitempty"`
+	JobID     string   `json:"job_id,omitempty"`
+	TaskIDs   []string `json:"task_ids,omitempty"`
+	JobIDs    []string `json:"job_ids,omitempty"`
+	TaskCount int      `json:"task_count,omitempty"`
+}
+
+type nvidiaTaskSelection struct {
+	GPUIndices []int
+	Label      string
+}
+
+func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
+	if len(tasks) == 0 {
+		writeJSON(w, taskRunResponse{})
+		return
+	}
+	ids := make([]string, 0, len(tasks))
+	for _, t := range tasks {
+		if t == nil || strings.TrimSpace(t.ID) == "" {
+			continue
+		}
+		ids = append(ids, t.ID)
+	}
+	resp := taskRunResponse{TaskCount: len(ids)}
+	if len(ids) > 0 {
+		resp.TaskID = ids[0]
+		resp.JobID = ids[0]
+		resp.TaskIDs = ids
+		resp.JobIDs = ids
+	}
+	writeJSON(w, resp)
+}
+
+func shouldSplitHomogeneousNvidiaTarget(target string) bool {
+	switch strings.TrimSpace(target) {
+	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
+		"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
+		"nvidia-bandwidth", "nvidia-stress":
+		return true
+	default:
+		return false
+	}
+}
+
+func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
+	if len(gpus) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs detected")
+	}
+	indexed := make(map[int]platform.NvidiaGPU, len(gpus))
+	allIndices := make([]int, 0, len(gpus))
+	for _, gpu := range gpus {
+		indexed[gpu.Index] = gpu
+		allIndices = append(allIndices, gpu.Index)
+	}
+	sort.Ints(allIndices)
+
+	selected := allIndices
+	if len(include) > 0 {
+		selected = make([]int, 0, len(include))
+		seen := make(map[int]struct{}, len(include))
+		for _, idx := range include {
+			if _, ok := indexed[idx]; !ok {
+				continue
+			}
+			if _, dup := seen[idx]; dup {
+				continue
+			}
+			seen[idx] = struct{}{}
+			selected = append(selected, idx)
+		}
+		sort.Ints(selected)
+	}
+	if len(exclude) > 0 {
+		skip := make(map[int]struct{}, len(exclude))
+		for _, idx := range exclude {
+			skip[idx] = struct{}{}
+		}
+		filtered := selected[:0]
+		for _, idx := range selected {
+			if _, ok := skip[idx]; ok {
+				continue
+			}
+			filtered = append(filtered, idx)
+		}
+		selected = filtered
+	}
+	if len(selected) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs selected")
+	}
+
+	modelGroups := make(map[string][]platform.NvidiaGPU)
+	modelOrder := make([]string, 0)
+	for _, idx := range selected {
+		gpu := indexed[idx]
+		model := strings.TrimSpace(gpu.Name)
+		if model == "" {
+			model = fmt.Sprintf("GPU %d", gpu.Index)
+		}
+		if _, ok := modelGroups[model]; !ok {
+			modelOrder = append(modelOrder, model)
+		}
+		modelGroups[model] = append(modelGroups[model], gpu)
+	}
+	sort.Slice(modelOrder, func(i, j int) bool {
+		left := modelGroups[modelOrder[i]]
+		right := modelGroups[modelOrder[j]]
+		if len(left) == 0 || len(right) == 0 {
+			return modelOrder[i] < modelOrder[j]
+		}
+		return left[0].Index < right[0].Index
+	})
+
+	var groups []nvidiaTaskSelection
+	var singles []nvidiaTaskSelection
+	for _, model := range modelOrder {
+		group := modelGroups[model]
+		sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index })
+		indices := make([]int, 0, len(group))
+		for _, gpu := range group {
+			indices = append(indices, gpu.Index)
+		}
+		if len(indices) >= 2 {
+			groups = append(groups, nvidiaTaskSelection{
+				GPUIndices: indices,
+				Label:      fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
+			})
+			continue
+		}
+		gpu := group[0]
+		singles = append(singles, nvidiaTaskSelection{
+			GPUIndices: []int{gpu.Index},
+			Label:      fmt.Sprintf("GPU %d — %s", gpu.Index, model),
+		})
+	}
+	return append(groups, singles...), nil
+}
+
+func joinTaskIndices(indices []int) string {
+	parts := make([]string, 0, len(indices))
+	for _, idx := range indices {
+		parts = append(parts, fmt.Sprintf("%d", idx))
+	}
+	return strings.Join(parts, ",")
+}
+
+func formatSplitTaskName(baseName, selectionLabel string) string {
+	baseName = strings.TrimSpace(baseName)
+	selectionLabel = strings.TrimSpace(selectionLabel)
+	if baseName == "" {
+		return selectionLabel
+	}
+	if selectionLabel == "" {
+		return baseName
+	}
+	return baseName + " (" + selectionLabel + ")"
+}
+
+func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
+	if !shouldSplitHomogeneousNvidiaTarget(target) {
+		t := &Task{
+			ID:        newJobID(idPrefix),
+			Name:      baseName,
+			Target:    target,
+			Priority:  priority,
+			Status:    TaskPending,
+			CreatedAt: createdAt,
+			params:    params,
+		}
+		return []*Task{t}, nil
+	}
+	gpus, err := apiListNvidiaGPUs(appRef)
+	if err != nil {
+		return nil, err
+	}
+	selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
+	if err != nil {
+		return nil, err
+	}
+	tasks := make([]*Task, 0, len(selections))
+	for _, selection := range selections {
+		taskParamsCopy := params
+		taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...)
+		taskParamsCopy.ExcludeGPUIndices = nil
+		displayName := formatSplitTaskName(baseName, selection.Label)
+		taskParamsCopy.DisplayName = displayName
+		tasks = append(tasks, &Task{
+			ID:        newJobID(idPrefix),
+			Name:      displayName,
+			Target:    target,
+			Priority:  priority,
+			Status:    TaskPending,
+			CreatedAt: createdAt,
+			params:    taskParamsCopy,
+		})
+	}
+	return tasks, nil
+}
 
 // ── SSE helpers ───────────────────────────────────────────────────────────────
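Note: tracing the splitting logic with a hypothetical mixed inventory (the same shape the handler tests below use):

```go
gpus := []platform.NvidiaGPU{
	{Index: 0, Name: "NVIDIA H100 PCIe"},
	{Index: 1, Name: "NVIDIA H100 PCIe"},
	{Index: 2, Name: "NVIDIA H200 NVL"},
}
sels, _ := expandHomogeneousNvidiaSelections(gpus, []int{0, 1, 2}, nil)
// sels[0]: GPUIndices=[0 1], Label="NVIDIA H100 PCIe; GPUs 0,1" (homogeneous group)
// sels[1]: GPUIndices=[2],   Label="GPU 2 — NVIDIA H200 NVL"    (leftover single)
```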
@@ -207,28 +433,28 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 		}
 
 		name := taskDisplayName(target, body.Profile, body.Loader)
-		t := &Task{
-			ID:        newJobID("sat-" + target),
-			Name:      name,
-			Target:    target,
-			Status:    TaskPending,
-			CreatedAt: time.Now(),
-			params: taskParams{
-				Duration:           body.Duration,
-				DiagLevel:          body.DiagLevel,
-				GPUIndices:         body.GPUIndices,
-				ExcludeGPUIndices:  body.ExcludeGPUIndices,
-				Loader:             body.Loader,
-				BurnProfile:        body.Profile,
-				DisplayName:        body.DisplayName,
-				PlatformComponents: body.PlatformComponents,
-			},
-		}
 		if strings.TrimSpace(body.DisplayName) != "" {
-			t.Name = body.DisplayName
+			name = body.DisplayName
 		}
-		globalQueue.enqueue(t)
-		writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
+		params := taskParams{
+			Duration:           body.Duration,
+			DiagLevel:          body.DiagLevel,
+			GPUIndices:         body.GPUIndices,
+			ExcludeGPUIndices:  body.ExcludeGPUIndices,
+			Loader:             body.Loader,
+			BurnProfile:        body.Profile,
+			DisplayName:        body.DisplayName,
+			PlatformComponents: body.PlatformComponents,
+		}
+		tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
+		if err != nil {
+			writeError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		for _, t := range tasks {
+			globalQueue.enqueue(t)
+		}
+		writeTaskRunResponse(w, tasks)
 	}
 }
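Note: for API clients, a split run now answers with the multi-ID shape, e.g. (hypothetical IDs) `{"task_id":"TASK-001","job_id":"TASK-001","task_ids":["TASK-001","TASK-002"],"job_ids":["TASK-001","TASK-002"],"task_count":2}`. The single `task_id`/`job_id` keep pointing at the first task so existing single-task clients continue to work.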
@@ -257,27 +483,26 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 	if body.RunNCCL != nil {
 		runNCCL = *body.RunNCCL
 	}
-	t := &Task{
-		ID:        newJobID("benchmark-nvidia"),
-		Name:      taskDisplayName("nvidia-benchmark", "", ""),
-		Target:    "nvidia-benchmark",
-		Priority:  15,
-		Status:    TaskPending,
-		CreatedAt: time.Now(),
-		params: taskParams{
-			GPUIndices:        body.GPUIndices,
-			ExcludeGPUIndices: body.ExcludeGPUIndices,
-			SizeMB:            body.SizeMB,
-			BenchmarkProfile:  body.Profile,
-			RunNCCL:           runNCCL,
-			DisplayName:       body.DisplayName,
-		},
-	}
+	name := taskDisplayName("nvidia-benchmark", "", "")
 	if strings.TrimSpace(body.DisplayName) != "" {
-		t.Name = body.DisplayName
+		name = body.DisplayName
 	}
-	globalQueue.enqueue(t)
-	writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
+	tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
+		GPUIndices:        body.GPUIndices,
+		ExcludeGPUIndices: body.ExcludeGPUIndices,
+		SizeMB:            body.SizeMB,
+		BenchmarkProfile:  body.Profile,
+		RunNCCL:           runNCCL,
+		DisplayName:       body.DisplayName,
+	}, name, h.opts.App, "benchmark-nvidia")
+	if err != nil {
+		writeError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+	for _, t := range tasks {
+		globalQueue.enqueue(t)
+	}
+	writeTaskRunResponse(w, tasks)
 }
 
 func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
@@ -1,6 +1,7 @@
 package webui
 
 import (
+	"encoding/json"
 	"net/http/httptest"
 	"strings"
 	"testing"
@@ -74,6 +75,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 		globalQueue.tasks = originalTasks
 		globalQueue.mu.Unlock()
 	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 3, Name: "NVIDIA H100 PCIe"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
 
 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
 	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
@@ -101,6 +110,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 	}
 }
 
+func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H200 NVL"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBenchmarkNvidiaRun(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	var resp taskRunResponse
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode response: %v", err)
+	}
+	if len(resp.TaskIDs) != 2 {
+		t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
+		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
+	}
+	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
+		t.Fatalf("task[1] gpu indices=%v want [2]", got)
+	}
+}
+
+func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H200 NVL"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
+		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
+	}
+	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
+		t.Fatalf("task[1] gpu indices=%v want [2]", got)
+	}
+}
+
 func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
 	h := &handler{}
 	h.pushFanRings([]platform.FanReading{
@@ -1070,14 +1070,24 @@ func renderValidate(opts HandlerOptions) string {
 	)) +
 		`</div>
 <div style="height:1px;background:var(--border);margin:16px 0"></div>
-<div class="card" style="margin-bottom:16px">
-<div class="card-head">NVIDIA GPU Selection</div>
-<div class="card-body">
-<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
-<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
-<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
-<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
-<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
-</div>
-<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
-<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
-</div>
-<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
-</div>
-</div>
-
 <div class="grid3">
-` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
+` + renderSATCard("nvidia-selection", "NVIDIA GPU Selection", "", "", renderValidateCardBody(
+	inv.NVIDIA,
+	`Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
+	`<code>nvidia-smi --query-gpu=index,name,memory.total</code>`,
+	`<div id="sat-gpu-list"><p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs…</p></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:8px"><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectAllGPUs()">Select all</button><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectNoGPUs()">Clear</button></div><div id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin-top:8px"></div>`,
+)) +
+	renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
 	inv.NVIDIA,
 	`Runs NVIDIA diagnostics and board inventory checks.`,
 	`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
@@ -1656,6 +1666,12 @@ func renderBenchmark(opts HandlerOptions) string {
 <script>
 let benchmarkES = null;
 
+function benchmarkTaskIDs(payload) {
+  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
+  if (payload && payload.task_id) return [payload.task_id];
+  return [];
+}
+
 function benchmarkSelectedGPUIndices() {
   return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
     .filter(function(el) { return el.checked && !el.disabled; })
@@ -1755,17 +1771,37 @@ function runNvidiaBenchmark() {
     return payload;
   });
 }).then(function(d) {
-  status.textContent = 'Task ' + d.task_id + ' queued.';
-  term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
-  benchmarkES = new EventSource('/api/tasks/' + d.task_id + '/stream');
-  benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
-  benchmarkES.addEventListener('done', function(e) {
-    benchmarkES.close();
-    benchmarkES = null;
-    term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
-    term.scrollTop = term.scrollHeight;
-    status.textContent = e.data ? 'Failed.' : 'Completed.';
-  });
+  const taskIds = benchmarkTaskIDs(d);
+  if (!taskIds.length) throw new Error('No benchmark task was queued.');
+  status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
+  const streamNext = function(idx, failures) {
+    if (idx >= taskIds.length) {
+      status.textContent = failures ? 'Completed with failures.' : 'Completed.';
+      return;
+    }
+    const taskId = taskIds[idx];
+    term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
+    benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
+    benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+    benchmarkES.addEventListener('done', function(e) {
+      benchmarkES.close();
+      benchmarkES = null;
+      if (e.data) failures += 1;
+      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+      term.scrollTop = term.scrollHeight;
+      streamNext(idx + 1, failures);
+    });
+    benchmarkES.onerror = function() {
+      if (benchmarkES) {
+        benchmarkES.close();
+        benchmarkES = null;
+      }
+      term.textContent += '\nERROR: stream disconnected.\n';
+      term.scrollTop = term.scrollHeight;
+      streamNext(idx + 1, failures + 1);
+    };
+  };
+  streamNext(0, 0);
 }).catch(function(err) {
   status.textContent = 'Error.';
   term.textContent += 'ERROR: ' + err.message + '\n';
@@ -1779,13 +1815,24 @@ benchmarkLoadGPUs();
 
 func renderBenchmarkResultsCard(exportDir string) string {
 	columns, runs := loadBenchmarkHistory(exportDir)
-	if len(runs) == 0 {
-		return `<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved benchmark runs yet.</p></div></div>`
-	}
+	return renderBenchmarkResultsCardFromRuns(
+		"Benchmark Results",
+		"Composite score by saved benchmark run and GPU.",
+		"No saved benchmark runs yet.",
+		columns,
+		runs,
+	)
+}
+
+func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, columns []benchmarkHistoryColumn, runs []benchmarkHistoryRun) string {
+	if len(runs) == 0 {
+		return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
+	}
 	var b strings.Builder
-	b.WriteString(`<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body">`)
-	b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">Composite score by saved benchmark run and GPU.</p>`)
+	b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
+	if strings.TrimSpace(description) != "" {
+		b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
+	}
 	b.WriteString(`<div style="overflow-x:auto">`)
 	b.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
 	for _, col := range columns {
@@ -1820,7 +1867,10 @@ func loadBenchmarkHistory(exportDir string) ([]benchmarkHistoryColumn, []benchma
 		return nil, nil
 	}
 	sort.Strings(paths)
+	return loadBenchmarkHistoryFromPaths(paths)
+}
+
+func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
 	columnByKey := make(map[string]benchmarkHistoryColumn)
 	runs := make([]benchmarkHistoryRun, 0, len(paths))
 	for _, path := range paths {
@@ -2005,6 +2055,12 @@ func renderBurn() string {
 <script>
 let biES = null;
 
+function burnTaskIDs(payload) {
+  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
+  if (payload && payload.task_id) return [payload.task_id];
+  return [];
+}
+
 function burnProfile() {
   const selected = document.querySelector('input[name="burn-profile"]:checked');
   return selected ? selected.value : 'smoke';
@@ -2106,6 +2162,9 @@ function streamTask(taskId, label) {
   });
 }
 function streamBurnTask(taskId, label, resetTerminal) {
+  return streamBurnTaskSet([taskId], label, resetTerminal);
+}
+function streamBurnTaskSet(taskIds, label, resetTerminal) {
   if (biES) { biES.close(); biES = null; }
   document.getElementById('bi-output').style.display = 'block';
   document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
@@ -2113,27 +2172,40 @@ function streamBurnTaskSet(taskIds, label, resetTerminal) {
   if (resetTerminal) {
     term.textContent = '';
   }
-  term.textContent += 'Task ' + taskId + ' queued. Streaming...\n';
-  return new Promise(function(resolve) {
-    biES = new EventSource('/api/tasks/' + taskId + '/stream');
-    biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
-    biES.addEventListener('done', function(e) {
-      biES.close();
-      biES = null;
-      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
-      term.scrollTop = term.scrollHeight;
-      resolve({ok: !e.data, error: e.data || ''});
-    });
-    biES.onerror = function() {
-      if (biES) {
-        biES.close();
-        biES = null;
-      }
-      term.textContent += '\nERROR: stream disconnected.\n';
-      term.scrollTop = term.scrollHeight;
-      resolve({ok: false, error: 'stream disconnected'});
-    };
-  });
+  if (!Array.isArray(taskIds) || !taskIds.length) {
+    term.textContent += 'ERROR: no tasks queued.\n';
+    return Promise.resolve({ok:false, error:'no tasks queued'});
+  }
+  const streamNext = function(idx, failures) {
+    if (idx >= taskIds.length) {
+      return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
+    }
+    const taskId = taskIds[idx];
+    term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
+    return new Promise(function(resolve) {
+      biES = new EventSource('/api/tasks/' + taskId + '/stream');
+      biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+      biES.addEventListener('done', function(e) {
+        biES.close();
+        biES = null;
+        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+        term.scrollTop = term.scrollHeight;
+        resolve(failures + (e.data ? 1 : 0));
+      });
+      biES.onerror = function() {
+        if (biES) {
+          biES.close();
+          biES = null;
+        }
+        term.textContent += '\nERROR: stream disconnected.\n';
+        term.scrollTop = term.scrollHeight;
+        resolve(failures + 1);
+      };
+    }).then(function(nextFailures) {
+      return streamNext(idx + 1, nextFailures);
+    });
+  };
+  return streamNext(0, 0);
 }
@@ -2161,7 +2233,7 @@ function runBurnTaskSet(tasks, statusElId) {
     if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
     return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
       .then(function(d) {
-        return streamBurnTask(d.task_id, t.label, false);
+        return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
      })
      .then(function() {
        return runNext(idx + 1);
@@ -711,6 +711,8 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 		`controlled NVIDIA DCGM load`,
 		`<code>dcgmi diag targeted_stress</code>`,
 		`NVIDIA GPU Selection`,
+		`All NVIDIA validate tasks use only the GPUs selected here.`,
+		`Select All`,
 		`id="sat-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
@@ -230,6 +230,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
    b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
    b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
    b.WriteString(`</div></div></div>`)
    if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
        b.WriteString(benchmarkCard)
    }

    if len(report.Charts) > 0 {
        for _, chart := range report.Charts {
@@ -247,6 +250,57 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
    return b.String()
}

func renderTaskBenchmarkResultsCard(target, logText string) string {
    if strings.TrimSpace(target) != "nvidia-benchmark" {
        return ""
    }
    resultPath := taskBenchmarkResultPath(logText)
    if strings.TrimSpace(resultPath) == "" {
        return ""
    }
    columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
    if len(runs) == 0 {
        return ""
    }
    return renderBenchmarkResultsCardFromRuns(
        "Benchmark Results",
        "Composite score for this benchmark task.",
        "No benchmark results were saved for this task.",
        columns,
        runs,
    )
}

func taskBenchmarkResultPath(logText string) string {
    archivePath := taskArchivePathFromLog(logText)
    if archivePath == "" {
        return ""
    }
    runDir := strings.TrimSuffix(archivePath, ".tar.gz")
    if runDir == archivePath {
        return ""
    }
    return filepath.Join(runDir, "result.json")
}

func taskArchivePathFromLog(logText string) string {
    lines := strings.Split(logText, "\n")
    for i := len(lines) - 1; i >= 0; i-- {
        line := strings.TrimSpace(lines[i])
        if line == "" || !strings.HasPrefix(line, "Archive:") {
            continue
        }
        path := strings.TrimSpace(strings.TrimPrefix(line, "Archive:"))
        if strings.HasPrefix(path, "Archive written to ") {
            path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
        }
        if strings.HasSuffix(path, ".tar.gz") {
            return path
        }
    }
    return ""
}

func renderTaskStatusBadge(status string) string {
    className := map[string]string{
        TaskRunning: "badge-ok",

@@ -1149,7 +1149,32 @@ func taskArtifactsDir(root string, t *Task, status string) string {
    if strings.TrimSpace(root) == "" || t == nil {
        return ""
    }
    return filepath.Join(root, fmt.Sprintf("%s_%s_%s", t.ID, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
    prefix := taskFolderNumberPrefix(t.ID)
    return filepath.Join(root, fmt.Sprintf("%s_%s_%s", prefix, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
}

func taskFolderNumberPrefix(taskID string) string {
    taskID = strings.TrimSpace(taskID)
    if strings.HasPrefix(taskID, "TASK-") && len(taskID) >= len("TASK-000") {
        num := strings.TrimSpace(strings.TrimPrefix(taskID, "TASK-"))
        if len(num) == 3 {
            allDigits := true
            for _, r := range num {
                if r < '0' || r > '9' {
                    allDigits = false
                    break
                }
            }
            if allDigits {
                return num
            }
        }
    }
    fallback := sanitizeTaskFolderPart(taskID)
    if fallback == "" {
        return "000"
    }
    return fallback
}

func ensureTaskReportPaths(t *Task) {

@@ -163,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
    }
}

func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
    globalQueue.mu.Lock()
    origTasks := globalQueue.tasks
    globalQueue.tasks = nil
    globalQueue.mu.Unlock()
    origCounter := jobCounter.Load()
    jobCounter.Store(0)
    t.Cleanup(func() {
        globalQueue.mu.Lock()
        globalQueue.tasks = origTasks
        globalQueue.mu.Unlock()
        jobCounter.Store(origCounter)
    })

    if got := newJobID("ignored"); got != "TASK-000" {
        t.Fatalf("id=%q want TASK-000", got)
    }
    if got := newJobID("ignored"); got != "TASK-001" {
        t.Fatalf("id=%q want TASK-001", got)
    }
}

func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
    root := t.TempDir()
    task := &Task{
        ID:   "TASK-007",
        Name: "NVIDIA Benchmark",
    }
    got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
    if !strings.HasPrefix(got, "007_") {
        t.Fatalf("artifacts dir=%q want prefix 007_", got)
    }
}

func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
    dir := t.TempDir()
    logPath := filepath.Join(dir, "task.log")
@@ -325,6 +359,78 @@ func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
    }
}

func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
    dir := t.TempDir()
    metricsPath := filepath.Join(dir, "metrics.db")
    prevMetricsPath := taskReportMetricsDBPath
    taskReportMetricsDBPath = metricsPath
    t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })

    benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
    if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
        t.Fatal(err)
    }
    result := platform.NvidiaBenchmarkResult{
        GeneratedAt:      time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
        BenchmarkProfile: "standard",
        OverallStatus:    "OK",
        GPUs: []platform.BenchmarkGPUResult{
            {
                Index: 0,
                Name:  "NVIDIA H100 PCIe",
                Scores: platform.BenchmarkScorecard{
                    CompositeScore: 1176.25,
                },
            },
        },
    }
    raw, err := json.Marshal(result)
    if err != nil {
        t.Fatal(err)
    }
    if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
        t.Fatal(err)
    }

    artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
    if err := os.MkdirAll(artifactsDir, 0755); err != nil {
        t.Fatal(err)
    }
    task := &Task{
        ID:           "task-bench",
        Name:         "NVIDIA Benchmark",
        Target:       "nvidia-benchmark",
        Status:       TaskDone,
        CreatedAt:    time.Now().UTC().Add(-time.Minute),
        ArtifactsDir: artifactsDir,
    }
    ensureTaskReportPaths(task)
    logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
    if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
        t.Fatal(err)
    }

    if err := writeTaskReportArtifacts(task); err != nil {
        t.Fatalf("writeTaskReportArtifacts: %v", err)
    }

    body, err := os.ReadFile(task.ReportHTMLPath)
    if err != nil {
        t.Fatalf("ReadFile(report.html): %v", err)
    }
    html := string(body)
    for _, needle := range []string{
        `Benchmark Results`,
        `Composite score for this benchmark task.`,
        `NVIDIA H100 PCIe / GPU 0`,
        `1176.25`,
    } {
        if !strings.Contains(html, needle) {
            t.Fatalf("report missing %q: %s", needle, html)
        }
    }
}

func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
    var lines []string
    prev := taskSerialWriteLine

248
bible-local/docs/benchmark-clock-calibration.md
Normal file
@@ -0,0 +1,248 @@
# Benchmark clock calibration research

## Status
In progress. Baseline data from production servers pending.

## Background

The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
`avg_steady_clock < locked_target * 0.90`.

Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
even a healthy GPU in a non-ideal server will sustain clocks well below boost.
The 90% threshold has no empirical basis.
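
For reference, the current check is a single comparison. A minimal Go sketch (the function and parameter names are illustrative, not the benchmark's actual identifiers):

```go
// lowSMClockVsTarget mirrors the low_sm_clock_vs_target rule described above:
// flag when the steady-phase average SM clock falls below a fraction of the
// locked target clock. threshold is currently 0.90, the value this document
// argues needs empirical calibration.
func lowSMClockVsTarget(avgSteadyClockMHz, lockedTargetMHz, threshold float64) bool {
    if lockedTargetMHz <= 0 {
        return false // no lock target known; nothing to compare against
    }
    return avgSteadyClockMHz < lockedTargetMHz*threshold
}
```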

## Key observations (2026-04-06)

### H100 PCIe — new card, server not designed for it
- avg clock 1384 MHz, P95 1560 MHz (unstable; boost is 1755 MHz)
- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
- Stability: 70.0 — clocks erratic, no equilibrium found
- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high

### H200 NVL — new card, server not designed for it
- avg clock = P95 = 1635 MHz (perfectly stable)
- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
- Degradation: power_capped, thermal_limited
- Compute: 989 TOPS — card is computing correctly for its frequency

### Key insight
The meaningful distinction is not *whether* the card throttles but *how stably*
it throttles. The H200 found a thermal equilibrium (avg == P95, Stability 92);
the H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
instability may reflect a more severe thermal mismatch or a card issue.

`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
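
These two fingerprints could be encoded directly from per-reason throttle durations over the steady window. A rough Go sketch; the parameter names and the 2x / parity factors are illustrative assumptions, not calibrated values:

```go
// classifyThrottle distinguishes "server cooling constraint" from "card
// overheating" using per-reason throttle durations (seconds of the steady
// window each reason was active). Factors here are placeholders.
func classifyThrottle(swPowerSec, swThermalSec, hwThermalSec float64) string {
    switch {
    case hwThermalSec > 0 && hwThermalSec > 2*swThermalSec:
        return "card overheating; investigate the GPU itself"
    case swThermalSec > 0 && swPowerSec > 0.5*swThermalSec && swPowerSec < 2*swThermalSec:
        return "server cooling constraint; card likely OK"
    default:
        return "no dominant throttle pattern"
    }
}
```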

## Hypothesis for baseline

After testing on servers designed for their GPUs (proper cooling):
- Healthy GPU under sustained load will run at a stable fraction of boost
- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
- Base clock (`clocks.base.gr`) may be a better reference than boost:
  a healthy card under real workload should comfortably exceed base clock

## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)

Source: external stress test tool, ~90s runs, designed server, adequate power.

### Healthy fingerprint

- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
  - Avg steady (visual): **~1580–1620 MHz**
  - vs boost 1755 MHz: **~91–92%**
- Oscillation is NORMAL — this is the boost algorithm balancing under power cap
  - Stable power + oscillating clocks = healthy power-cap behavior
- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
- **Consistency**: all 10 samples within ±20 MHz — very repeatable

### Characteristic pattern
Flat power line + oscillating/declining clock line = GPU correctly managed by the
power cap algorithm. Do NOT flag this as instability.

### Clock CV implication
The healthy oscillation WILL produce a moderate ClockCVPct (~5–10%).
The current `variance_too_high` threshold (StabilityScore < 85) may fire on
healthy HBM2e PCIe cards. Needs recalibration.

---

## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples

Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.

### GPU clock reference (from nvidia-smi, idle):
- base_clock_mhz: **1095**
- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip

### Observed under 700W sustained load (both samples nearly identical):
- Power: ~700W flat — SXM slot, adequate power confirmed
- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
  - vs 1980 MHz (lock target): **72–74%** — severely below
  - vs 1755 MHz (nvidia-smi boost): **81–83%**
  - vs 1095 MHz (base): 130% — above base but far below expected for SXM
- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
- Temperature: 38°C → 79–80°C (same rate as HBM2e)
- Oscillation: present, similar character to HBM2e but centered at a much lower clock

### Diagnosis
These restored cards are degraded. A healthy H100 SXM in a designed server
(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
The 72–74% result is a clear signal of silicon or VRM degradation from the
refurbishment process.

### Clock pattern note
Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
to images 19/20. Both sample sets show the same degraded pattern — same batch.

---

## Baseline matrix (filled where data available)

| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
|---|---|---|---|---|---|
| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
| H200 NVL | designed | TBD | TBD | TBD | need baseline |

---

## H100 official spec (from NVIDIA datasheet)

Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
Datasheet TOPS marked * are quoted with structural sparsity enabled; divide by 2
for the dense values listed below.

| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|---|---|---|---|---|---|
| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |

Notes:
- SXM boards do NOT list FP8 peak in this table (field empty)
- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)

## Observed efficiency (H100 80GB PCIe, throttled server)

From the report in this session (power+thermal throttle throughout steady):

| Precision | Measured | Spec (dense) | % of spec |
|---|---|---|---|
| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |

33–44% of spec is expected given sustained power+thermal throttle (avg clock
1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
actual frequency — the low TOPS comes from throttle, not a silicon defect.

## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)

Datasheet format: without sparsity / with sparsity; dense values listed below.

| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|---|---|---|---|---|---|
| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |

## Observed efficiency (H200 NVL PCIe, throttled non-designed server)

Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady window in thermal throttle.

| Precision | Measured | Spec (dense) | % of spec |
|---|---|---|---|
| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |

Comparable to H100 PCIe efficiency (33–44%) despite a different architecture —
both are throttle-limited. This confirms that % of spec is not a quality signal;
it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.

## Real-world GEMM efficiency reference (2026-04-06, web research)

Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.

### What healthy systems actually achieve:
- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
- H200 NVL PCIe: no public data; extrapolating ~73% → ~610 TFLOPS FP16

### Our results vs expectation:
| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
|---|---|---|---|---|
| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |

Our results are roughly **half** of what a healthy system achieves even under
throttle. This is NOT normal — 30–44% is not the industry baseline.

### Likely causes of the gap (in order of probability):
1. **Thermal throttle** — confirmed; sw_thermal covers the entire steady window
2. **Power limit below TDP** — the GPU may be software-limited below 350W/600W.
   A previous user may have set a lower limit via nvidia-smi -pl that was never
   reset. Our normalization sets clock locks but does NOT reset the power limit.
   Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
   8192×8192×4096 for fp8. These are large enough for peak tensor utilization
   (see the worked numbers below).
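
For scale: a single 4096×4096×4096 fp16 GEMM is 2·4096³ ≈ 137 GFLOP, so even at the full 756 TFLOPS spec one GEMM takes about 0.18 ms, which is more than enough work in flight to saturate the tensor pipes.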

### Power limit gap analysis (H100 PCIe):
- Avg clock 1384 MHz = 79% of boost 1755 MHz
- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
- Actually measured: 329 TOPS = 55% of that estimate
- Remaining gap after accounting for clock throttle: ~45%
- Most likely explanation: enforced power limit < 350W TDP, further reducing
  sustainable clock beyond what sw_thermal alone would cause.

### Action item:
Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
so result.json shows whether the card was pre-configured with a non-default limit.
If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
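
A minimal Go sketch of that finding, assuming both limits have been parsed into watts (the function name and signature are illustrative; the 0.95 factor is the rule above):

```go
package benchmark

import "fmt"

// powerLimitFinding flags a card whose enforced power limit was left below its
// default TDP (e.g. a previous tenant's `nvidia-smi -pl` that was never reset).
// The 0.95 guard band keeps small rounding differences between the two
// readings from triggering the finding.
func powerLimitFinding(enforcedW, defaultW float64) (string, bool) {
    if enforcedW <= 0 || defaultW <= 0 {
        return "", false // one of the limits was not reported
    }
    if enforcedW < defaultW*0.95 {
        return fmt.Sprintf("GPU power limit is below default TDP (%.0f W < %.0f W)",
            enforcedW, defaultW), true
    }
    return "", false
}
```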

### CPU/RAM impact on GPU FLOPS:
None. A pure on-GPU GEMM is fully compute-bound once data is in VRAM.
CPU core count and host RAM are irrelevant.

## Compute efficiency metric (proposed, no hardcode)

Instead of comparing TOPS to a hardcoded spec, compute:

    tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)

This is model-agnostic. A GPU computing correctly at its actual frequency
will show a consistent tops_per_sm_per_ghz regardless of throttle level.
A GPU with degraded silicon will show a low tops_per_sm_per_ghz even at
normal clocks.

SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
(needs to be added to queryBenchmarkGPUInfo).
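
A Go sketch of the metric, assuming SM count and the measured TOPS are available (a direct transcription of the formula; the names are illustrative):

```go
// topsPerSMPerGHz normalizes measured throughput by SM count and achieved
// clock, so the value is comparable across GPU models and throttle levels.
func topsPerSMPerGHz(measuredTOPS float64, smCount int, avgClockMHz float64) float64 {
    if smCount <= 0 || avgClockMHz <= 0 {
        return 0
    }
    return measuredTOPS / (float64(smCount) * (avgClockMHz / 1000.0))
}
```

Worked example from the H100 PCIe run above, using the 114 SMs an H100 PCIe exposes: 329 / (114 × 1.384) ≈ 2.1 TOPS per SM per GHz, the value a healthy card of the same model should roughly reproduce at any throttle level.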

Reference values to establish after baseline runs:
- H100 PCIe fp16_tensor: TBD tops/SM/GHz
- H100 SXM fp16_tensor: TBD tops/SM/GHz

## Proposed threshold changes (pending more data)

1. **`low_sm_clock_vs_target`**: relax the threshold from 90% to 85% based on the
   observed 91–92% on healthy HBM2e. Or remove it entirely — sw_power/sw_thermal
   already capture the root cause.

2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
   under power cap. Consider suppressing this flag when power is flat and usage
   is 100% (oscillation is expected). Or lower the threshold to 70.

3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
   the ratio avg_clock / power_w could identify degraded silicon (the restored
   HBM3 S1 would have been caught by this); see the sketch after this list.

Decision deferred until baselines on designed SXM servers are collected.
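
A rough Go sketch of signal 3: `clockPerWatt` is the measurement and `degradedSilicon` the comparison against a per-model healthy baseline from the matrix above; the 0.7 factor is an uncalibrated placeholder, not a decided threshold:

```go
// clockPerWatt is the proposed MHz/W efficiency signal: the restored HBM3
// cards above measure ~2.1 MHz/W where healthy HBM2e PCIe shows ~4.6 MHz/W.
func clockPerWatt(avgClockMHz, avgPowerW float64) float64 {
    if avgPowerW <= 0 {
        return 0
    }
    return avgClockMHz / avgPowerW
}

// degradedSilicon compares a measured MHz/W value against the per-model
// healthy baseline. The 0.7 factor is a placeholder pending calibration.
func degradedSilicon(measuredMHzPerW, healthyMHzPerW float64) bool {
    return healthyMHzPerW > 0 && measuredMHzPerW < 0.7*healthyMHzPerW
}
```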

@@ -606,6 +606,20 @@ struct prepared_profile {
};

static const struct profile_desc k_profiles[] = {
    {
        "fp64",
        "fp64",
        80,
        1,
        0,
        0,
        8,
        CUDA_R_64F,
        CUDA_R_64F,
        CUDA_R_64F,
        CUDA_R_64F,
        CUBLAS_COMPUTE_64F,
    },
    {
        "fp32_tf32",
        "fp32",

@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
        ;;
    *)
        echo "unknown arg: $1" >&2
        echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
        echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
        exit 1
        ;;
    esac
done

case "$VARIANT" in
    nvidia|amd|nogpu|all) ;;
    *) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
    nvidia|nvidia-legacy|amd|nogpu|all) ;;
    *) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
esac

if [ "$CLEAN_CACHE" = "1" ]; then
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
        "${CACHE_DIR:?}/lb-packages"
    echo "=== cleaning live-build work dirs ==="
    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
    rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
    rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
    rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
    rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
    rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
    rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
    echo "=== caches cleared, proceeding with build ==="
fi

@@ -180,6 +185,9 @@ case "$VARIANT" in
    nvidia)
        run_variant nvidia
        ;;
    nvidia-legacy)
        run_variant nvidia-legacy
        ;;
    amd)
        run_variant amd
        ;;
@@ -188,6 +196,7 @@ case "$VARIANT" in
        ;;
    all)
        run_variant nvidia
        run_variant nvidia-legacy
        run_variant amd
        run_variant nogpu
        ;;

@@ -1,8 +1,10 @@
#!/bin/sh
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
#
# Downloads the official NVIDIA .run installer, extracts kernel modules and
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
#   - open        -> kernel-open/ sources from the .run installer
#   - proprietary -> traditional proprietary kernel sources from the .run installer
#
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
@@ -17,10 +19,19 @@ set -e
NVIDIA_VERSION="$1"
DIST_DIR="$2"
DEBIAN_KERNEL_ABI="$3"
NVIDIA_FLAVOR="${4:-open}"

[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }

case "$NVIDIA_FLAVOR" in
    open|proprietary) ;;
    *)
        echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
        exit 1
        ;;
esac

KVER="${DEBIAN_KERNEL_ABI}-amd64"
# On Debian, kernel headers are split into two packages:
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"

echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="

if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
    echo "=== installing linux-headers-${KVER} ==="
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        "linux-headers-${KVER}" \
        gcc make perl
fi
echo "kernel headers (arch): $KDIR_ARCH"
echo "kernel headers (common): $KDIR_COMMON"

CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
CACHE_LAYOUT_VERSION="2"
CACHE_LAYOUT_VERSION="3"
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
    && [ -f "$CACHE_LAYOUT_MARKER" ] \
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
    exit 0
fi

if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
    echo "=== installing linux-headers-${KVER} ==="
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        "linux-headers-${KVER}" \
        gcc make perl
fi
echo "kernel headers (arch): $KDIR_ARCH"
echo "kernel headers (common): $KDIR_COMMON"

# Download official NVIDIA .run installer with sha256 verification
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
rm -rf "$EXTRACT_DIR"
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"

# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
# Find kernel source directory for the selected flavor.
KERNEL_SRC=""
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
    [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
done
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
if [ "$NVIDIA_FLAVOR" = "open" ]; then
    for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
        [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
    done
else
    for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
        [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
    done
fi
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
echo "kernel source: $KERNEL_SRC"

# Build kernel modules

@@ -15,26 +15,46 @@ DIST_DIR="${REPO_ROOT}/dist"
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
AUTH_KEYS=""
BUILD_VARIANT="nvidia"
BEE_GPU_VENDOR="nvidia"
BEE_NVIDIA_MODULE_FLAVOR="open"

# parse args
while [ $# -gt 0 ]; do
    case "$1" in
        --authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
        --variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
        --variant) BUILD_VARIANT="$2"; shift 2 ;;
        *) echo "unknown arg: $1"; exit 1 ;;
    esac
done

case "$BEE_GPU_VENDOR" in
    nvidia|amd|nogpu) ;;
    *) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
case "$BUILD_VARIANT" in
    nvidia)
        BEE_GPU_VENDOR="nvidia"
        BEE_NVIDIA_MODULE_FLAVOR="open"
        ;;
    nvidia-legacy)
        BEE_GPU_VENDOR="nvidia"
        BEE_NVIDIA_MODULE_FLAVOR="proprietary"
        ;;
    amd)
        BEE_GPU_VENDOR="amd"
        BEE_NVIDIA_MODULE_FLAVOR=""
        ;;
    nogpu)
        BEE_GPU_VENDOR="nogpu"
        BEE_NVIDIA_MODULE_FLAVOR=""
        ;;
    *)
        echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
        exit 1
        ;;
esac

BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"

export BEE_GPU_VENDOR
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT

. "${BUILDER_DIR}/VERSIONS"
export PATH="$PATH:/usr/local/go/bin"
@@ -627,7 +647,7 @@ recover_iso_memtest() {

AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
mkdir -p "${OUT_DIR}"
@@ -801,7 +821,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
    apt-get install -y "linux-headers-${KVER}"
fi

echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
echo ""
@@ -871,7 +891,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    fi
fi

echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"

# Sync builder config into variant work dir, preserving lb cache.
@@ -897,6 +917,86 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
    rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
fi

if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
    cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
source /boot/grub/config.cfg

echo ""
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
echo " Hardware Audit LiveCD"
echo ""

menuentry "EASY-BEE" {
    linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd @INITRD_LIVE@
}

submenu "EASY-BEE (advanced options) -->" {
    menuentry "EASY-BEE — KMS (no nomodeset)" {
        linux @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
        initrd @INITRD_LIVE@
    }

    menuentry "EASY-BEE — fail-safe" {
        linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
        initrd @INITRD_LIVE@
    }
}

if [ "${grub_platform}" = "efi" ]; then
    menuentry "Memory Test (memtest86+)" {
        chainloader /boot/memtest86+x64.efi
    }
else
    menuentry "Memory Test (memtest86+)" {
        linux16 /boot/memtest86+x64.bin
    }
fi

if [ "${grub_platform}" = "efi" ]; then
    menuentry "UEFI Firmware Settings" {
        fwsetup
    }
fi
EOF

    cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
label live-@FLAVOUR@-normal
    menu label ^EASY-BEE
    menu default
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@

label live-@FLAVOUR@-kms
    menu label EASY-BEE (^graphics/KMS)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ bee.display=kms

label live-@FLAVOUR@-toram
    menu label EASY-BEE (^load to RAM)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ toram

label live-@FLAVOUR@-failsafe
    menu label EASY-BEE (^fail-safe)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal

label memtest
    menu label ^Memory Test (memtest86+)
    linux /boot/memtest86+x64.bin
EOF
fi

rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
rm -f \
    "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
@@ -981,10 +1081,10 @@ done
# --- NVIDIA kernel modules and userspace libs ---
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
        sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
        sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"

    KVER="${DEBIAN_KERNEL_ABI}-amd64"
    NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
    NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"

    # Inject .ko files into overlay at /usr/local/lib/nvidia/
    OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
@@ -1055,13 +1155,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u

if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
NCCL_VERSION=${NCCL_VERSION}
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
CUBLAS_VERSION=${CUBLAS_VERSION}
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
    GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
    GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
    GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
    GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
@@ -1073,6 +1174,7 @@ fi
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
BEE_BUILD_VARIANT=${BUILD_VARIANT}
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
BUILD_DATE=${BUILD_DATE}
GIT_COMMIT=${GIT_COMMIT}
@@ -1083,6 +1185,11 @@ EOF

# Write GPU vendor marker for hooks
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
else
    rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
fi

# Patch motd with build info
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
@@ -1153,10 +1260,10 @@ fi

# --- build ISO using live-build ---
echo ""
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="

# Export for auto/config
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
export BEE_GPU_VENDOR_UPPER

cd "${LB_DIR}"
@@ -1191,7 +1298,7 @@ if [ -f "$ISO_RAW" ]; then
    validate_iso_nvidia_runtime "$ISO_RAW"
    cp "$ISO_RAW" "$ISO_OUT"
    echo ""
    echo "=== done (${BEE_GPU_VENDOR}) ==="
    echo "=== done (${BUILD_VARIANT}) ==="
    echo "ISO: $ISO_OUT"
    if command -v stat >/dev/null 2>&1; then
        ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"

41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
@@ -0,0 +1,41 @@
#!/bin/sh
# 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
#
# live-boot tries "losetup --replace --direct-io=on" when re-associating the
# loop device to the RAM copy in /dev/shm. tmpfs does not support O_DIRECT,
# so the ioctl returns EINVAL and the verification step fails.
#
# The patch strips --direct-io=on from the losetup --replace call, and relaxes
# the fatal "Task finished with error." verification to a warning so the boot
# continues even when re-association is imperfect (squashfs still accessible).
set -e

TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"

if [ ! -f "${TORAM_SCRIPT}" ]; then
    echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
    exit 0
fi

echo "9010-fix-toram: patching ${TORAM_SCRIPT}"

# 1. Strip --direct-io=on from the losetup --replace call so it works on tmpfs.
sed -i 's/losetup --replace --direct-io=on/losetup --replace/g' "${TORAM_SCRIPT}"
sed -i 's/losetup --replace --direct-io/losetup --replace/g' "${TORAM_SCRIPT}"

# 2. Turn the hard error into a warning so boot continues.
#    live-boot prints this exact string when verification fails.
sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"

echo "9010-fix-toram: patch applied"
grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true

@@ -27,6 +27,7 @@ echo ""
KVER=$(uname -r)
info "kernel: $KVER"
NVIDIA_BOOT_MODE="normal"
NVIDIA_MODULES_FLAVOR="proprietary"
for arg in $(cat /proc/cmdline 2>/dev/null); do
    case "$arg" in
        bee.nvidia.mode=*)
@@ -34,7 +35,11 @@ for arg in $(cat /proc/cmdline 2>/dev/null); do
        ;;
    esac
done
if [ -f /etc/bee-nvidia-modules-flavor ]; then
    NVIDIA_MODULES_FLAVOR="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null || echo proprietary)"
fi
info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
info "nvidia modules flavor: ${NVIDIA_MODULES_FLAVOR}"

# --- PATH & binaries ---
echo "-- PATH & binaries --"
@@ -110,10 +115,12 @@ fi
for mod in nvidia_modeset nvidia_uvm; do
    if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
        ok "module loaded: $mod"
    elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
    elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
        fail "module NOT loaded in normal mode: $mod"
    else
    elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
        warn "module not loaded in GSP-off mode: $mod"
    else
        fail "module NOT loaded: $mod"
    fi
done

@@ -129,10 +136,12 @@ done

if [ -e /dev/nvidia-uvm ]; then
    ok "/dev/nvidia-uvm exists"
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
    fail "/dev/nvidia-uvm missing in normal mode"
else
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
    warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
else
    fail "/dev/nvidia-uvm missing"
fi

echo ""

@@ -62,6 +62,8 @@ done
echo "loader=bee-gpu-burn"
echo "selected_gpus=${FINAL}"

# PCI_BUS_ID ordering keeps CUDA device indexes aligned with nvidia-smi's view.
export CUDA_DEVICE_ORDER="PCI_BUS_ID"

TMP_DIR=$(mktemp -d)
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM

@@ -78,7 +80,8 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
        fi
    fi
    echo "starting gpu ${id} size=${gpu_size_mb}MB"
    "${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    # Masking to a single GPU means the worker always addresses it as device 0.
    CUDA_VISIBLE_DEVICES="${id}" \
        "${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
done

@@ -152,14 +152,19 @@ done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export CUDA_VISIBLE_DEVICES="${FINAL}"

JOHN_DEVICES=""
local_id=1
for id in $(echo "${FINAL}" | tr ',' ' '); do
    opencl_id=$((id + 1))
    # With CUDA_VISIBLE_DEVICES masking, John sees the selected GPUs as a
    # contiguous sequence, so use 1-based local ids instead of global indexes.
    opencl_id="${local_id}"
    if [ -z "${JOHN_DEVICES}" ]; then
        JOHN_DEVICES="${opencl_id}"
    else
        JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
    fi
    local_id=$((local_id + 1))
done

echo "loader=john"

@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
echo "range=${MIN_BYTES}..${MAX_BYTES}"
echo "iters=${ITERS}"

export CUDA_DEVICE_ORDER="PCI_BUS_ID"

deadline=$(( $(date +%s) + SECONDS ))
round=0

@@ -6,6 +6,19 @@ NVIDIA_KO_DIR="/usr/local/lib/nvidia"

log() { echo "[bee-nvidia] $*"; }

read_nvidia_modules_flavor() {
    if [ -f /etc/bee-nvidia-modules-flavor ]; then
        flavor="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null)"
        case "$flavor" in
            open|proprietary)
                echo "$flavor"
                return 0
                ;;
        esac
    fi
    echo "proprietary"
}

log "kernel: $(uname -r)"

# Skip if no NVIDIA GPU present (PCI vendor 10de)
@@ -40,6 +53,8 @@ if [ -z "$nvidia_mode" ]; then
    nvidia_mode="normal"
fi
log "boot mode: $nvidia_mode"
nvidia_modules_flavor="$(read_nvidia_modules_flavor)"
log "modules flavor: $nvidia_modules_flavor"

load_module() {
    mod="$1"
@@ -150,37 +165,54 @@ load_host_module() {
    return 1
}

case "$nvidia_mode" in
    normal|full)
        if ! load_module_with_gsp_fallback; then
            exit 1
        fi
        # nvidia-modeset on some server kernels needs ACPI video helper symbols
        # exported by the generic "video" module. Best-effort only; compute paths
        # remain functional even if display-related modules stay absent.
        load_host_module video || true
        load_module nvidia-modeset || true
        load_module nvidia-uvm || true
        ;;
    gsp-off|safe)
        # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
        # be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
        # conservative path for platforms where full boot-time GSP init is unstable.
        if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
            exit 1
        fi
        log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
        ;;
    nomsi|*)
        # nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
        # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
        # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
        if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
            exit 1
        fi
        log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
        ;;
esac
if [ "$nvidia_modules_flavor" = "open" ]; then
    case "$nvidia_mode" in
        gsp-off|safe|nomsi)
            log "ignoring boot mode ${nvidia_mode} for open NVIDIA modules"
            ;;
    esac
    if ! load_module nvidia; then
        exit 1
    fi
    # nvidia-modeset on some server kernels needs ACPI video helper symbols
    # exported by the generic "video" module. Best-effort only; compute paths
    # remain functional even if display-related modules stay absent.
    load_host_module video || true
    load_module nvidia-modeset || true
    load_module nvidia-uvm || true
else
    case "$nvidia_mode" in
        normal|full)
            if ! load_module_with_gsp_fallback; then
                exit 1
            fi
            # nvidia-modeset on some server kernels needs ACPI video helper symbols
            # exported by the generic "video" module. Best-effort only; compute paths
            # remain functional even if display-related modules stay absent.
            load_host_module video || true
            load_module nvidia-modeset || true
            load_module nvidia-uvm || true
            ;;
        gsp-off|safe)
            # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
            # be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
            # conservative path for platforms where full boot-time GSP init is unstable.
            if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
                exit 1
            fi
            log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
            ;;
        nomsi|*)
            # nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
            # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
            # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
            if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
                exit 1
            fi
            log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
            ;;
    esac
fi

# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')