Compare commits


15 Commits
v8.32 ... v8.39

Author SHA1 Message Date
6112094d45 fix(grub): fix bitmap error and menu rendering
- Convert bee-logo.png to RGBA (color type 6) and strip all metadata
  chunks (cHRM, bKGD, tIME, tEXt) that confuse GRUB's minimal PNG parser
- Move terminal_output gfxterm before insmod png / theme load so the
  theme initialises in an active gfxterm context (see the sketch after
  this list)
- Remove echo ASCII art banner from grub.cfg — with gfxterm active and
  no terminal_box in the theme, echo output renders over the menu area
- Fix icon_heigh typo → icon_height; increase item_height 16→20 with
  item_padding 0→2 for reliable text rendering in boot_menu
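
A minimal sketch of the resulting command order in grub.cfg, assuming the stock GRUB commands (loadfont, insmod, terminal_output); the font and theme paths here are illustrative, not necessarily the repo's actual names:

    # Activate gfxterm first so the theme initialises in a live graphical context.
    loadfont /boot/grub/fonts/unicode.pf2
    insmod all_video
    insmod gfxterm
    terminal_output gfxterm

    # Only now pull in the PNG decoder and load the theme.
    insmod png
    set theme=/boot/grub/live-theme/theme.txt
    export theme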

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 22:05:16 +03:00
e9a2bc9f9d update submodule 2026-04-22 20:39:27 +03:00
Mikhail Chusavitin
7a8f884664 fix(boot): remove advanced options submenu
Keep only EASY-BEE and toram entries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 19:01:50 +03:00
Mikhail Chusavitin
8bf8dfa45b fix(boot): default to KMS + pci=realloc, drop nomodeset from main entries
Default and toram entries now boot with bee.display=kms (ASPEED AST
loads via KMS, Xorg uses modesetting driver) and pci=realloc (Linux
reassigns GPU BARs when BIOS lacks Above 4G Decoding). nomodeset
removed from these entries; still present in GSP=off and fail-safe.
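
For reference, the default entry after this change, as it appears in the canonical grub.cfg template later in this diff (${kernel}, ${append_live}, and ${initrd} are substituted at build time):

    menuentry "EASY-BEE" {
        linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd ${initrd}
    }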

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 19:00:04 +03:00
Mikhail Chusavitin
6a22199aff chore(bible): bump ascii-safe-text contract
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 18:52:10 +03:00
Mikhail Chusavitin
ddb2bb5d1c fix(grub): replace em-dash with ASCII -- in all menu entry titles
Em-dash (U+2014) renders as garbage on GRUB serial/SOL output
(IPMI BMC consoles). Replace with ASCII double-hyphen throughout
grub.cfg template, write_canonical_grub_cfg, and theme.txt comment.
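
A minimal Go sketch of the substitution rule; the helper name is hypothetical (the actual fix lives in the grub.cfg template and write_canonical_grub_cfg, not in a Go helper):

    package main

    import (
    	"fmt"
    	"strings"
    )

    // asciiSafeTitle is an illustrative helper for the ascii-safe-text
    // contract: U+2014 EM DASH becomes the ASCII double-hyphen so IPMI
    // SOL consoles render menu titles correctly.
    func asciiSafeTitle(s string) string {
    	return strings.ReplaceAll(s, "\u2014", "--")
    }

    func main() {
    	fmt.Println(asciiSafeTitle("EASY-BEE \u2014 load to RAM (toram)"))
    	// Output: EASY-BEE -- load to RAM (toram)
    }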

Also align template grub.cfg structure with write_canonical_grub_cfg:
toram entry moved to top level (was inside submenu).

bible: add ascii-safe-text contract documenting the no-em-dash rule.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 18:52:04 +03:00
Mikhail Chusavitin
aa284ae754 fix(iso): avoid grub logo scaling error 2026-04-20 14:06:32 +03:00
Mikhail Chusavitin
8512098174 fix(iso): restore bootappend-live in canonical boot menu 2026-04-20 13:39:05 +03:00
Mikhail Chusavitin
6b5d22c194 chore(git): ignore local audit binary 2026-04-20 13:21:35 +03:00
Mikhail Chusavitin
a35e90a93e fix(iso): clear stale bootloader templates in workdir 2026-04-20 13:19:50 +03:00
Mikhail Chusavitin
1ced81707f fix(iso): validate live boot entries in final ISO 2026-04-20 13:12:24 +03:00
Mikhail Chusavitin
679aeb9947 Run NVIDIA DCGM diag tests on all selected GPUs simultaneously
targeted_stress, targeted_power, and the Level 2/3 diag were dispatched
one GPU at a time from the UI, turning a single dcgmi command into 8
sequential ~350–450 s runs. DCGM supports -i with a comma-separated list
of GPU indices and runs the diagnostic on all of them in parallel.
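
A sketch of the dispatch change in Go; the helper name is illustrative, while `dcgmi diag -r <level> -i <comma-separated GPU list>` is the documented DCGM interface:

    package main

    import (
    	"fmt"
    	"os/exec"
    	"strconv"
    	"strings"
    )

    // runDiagAllGPUs builds one dcgmi invocation covering every selected
    // GPU at once, instead of one process per GPU (helper name is
    // hypothetical, for illustration only).
    func runDiagAllGPUs(level int, gpuIndices []int) *exec.Cmd {
    	ids := make([]string, len(gpuIndices))
    	for i, idx := range gpuIndices {
    		ids[i] = strconv.Itoa(idx)
    	}
    	// e.g. dcgmi diag -r 3 -i 0,1,2,3,4,5,6,7
    	return exec.Command("dcgmi", "diag", "-r", strconv.Itoa(level), "-i", strings.Join(ids, ","))
    }

    func main() {
    	cmd := runDiagAllGPUs(3, []int{0, 1, 2, 3, 4, 5, 6, 7})
    	fmt.Println(strings.Join(cmd.Args, " "))
    }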

Move nvidia, nvidia-targeted-stress, nvidia-targeted-power into
nvidiaAllGPUTargets so expandSATTarget passes all selected indices in one
API call. Simplify runNvidiaValidateSet to match runNvidiaFabricValidate.
Update sat.go constants and page_validate.go estimates to reflect all-GPU
simultaneous execution (remove n× multiplier from total time estimates).

Stress test on 8-GPU system: ~5.3 h → ~2.5 h.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 11:53:25 +03:00
Mikhail Chusavitin
647e99b697 Fix post-sync live-build ISO rebuild 2026-04-20 11:01:15 +03:00
Mikhail Chusavitin
4af997f436 Update audit bee binary 2026-04-20 10:55:42 +03:00
Mikhail Chusavitin
6caace0cc0 Make power benchmark report phase-averaged 2026-04-20 10:53:53 +03:00
10 changed files with 459 additions and 235 deletions

.gitignore vendored (1 change)
View File

@@ -3,3 +3,4 @@
 dist/
 iso/out/
 build-cache/
+audit/bee

BIN audit/bee

Binary file not shown.

View File

@@ -67,6 +67,13 @@ type benchmarkPowerCalibrationResult struct {
 	MetricRows []GPUMetricRow
 }
 
+type benchmarkPowerCalibrationRunSummary struct {
+	LoadedSDR          benchmarkSDRSeriesSummary
+	AvgFanRPM          float64
+	AvgFanDutyCyclePct float64
+	FanSamples         int
+}
+
 type benchmarkBurnProfile struct {
 	name     string
 	category string
@@ -2413,6 +2420,16 @@ type sdrPowerSnapshot struct {
 	SkippedSensors []string // sensors rejected during self-healing
 }
 
+type benchmarkSDRSeriesSummary struct {
+	PSUInW         float64
+	PSUOutW        float64
+	GPUSlotW       float64
+	PSUSlots       map[string]BenchmarkPSUSlotPower
+	Samples        int
+	SkippedSensors []string
+}
+
 // sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
 type sdrSensor struct {
 	name string
@@ -2542,6 +2559,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
 	return snap
 }
 
+func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot {
+	if intervalSec <= 0 {
+		intervalSec = benchmarkPowerAutotuneSampleInterval
+	}
+	ch := make(chan []sdrPowerSnapshot, 1)
+	go func() {
+		defer close(ch)
+		var samples []sdrPowerSnapshot
+		record := func() {
+			snap := sampleIPMISDRPowerSensors()
+			if snap.PSUInW <= 0 && snap.PSUOutW <= 0 && snap.GPUSlotW <= 0 && len(snap.PSUSlots) == 0 {
+				return
+			}
+			samples = append(samples, snap)
+		}
+		record()
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- samples
+				return
+			case <-ticker.C:
+				record()
+			}
+		}
+	}()
+	return ch
+}
+
+func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary {
+	var summary benchmarkSDRSeriesSummary
+	if len(samples) == 0 {
+		return summary
+	}
+	type slotAggregate struct {
+		inputs  []float64
+		outputs []float64
+		status  string
+	}
+	slotAgg := make(map[string]*slotAggregate)
+	skippedSet := make(map[string]struct{})
+	var inputTotals []float64
+	var outputTotals []float64
+	var gpuSlotTotals []float64
+	for _, sample := range samples {
+		if sample.PSUInW > 0 {
+			inputTotals = append(inputTotals, sample.PSUInW)
+		}
+		if sample.PSUOutW > 0 {
+			outputTotals = append(outputTotals, sample.PSUOutW)
+		}
+		if sample.GPUSlotW > 0 {
+			gpuSlotTotals = append(gpuSlotTotals, sample.GPUSlotW)
+		}
+		for _, skipped := range sample.SkippedSensors {
+			if skipped != "" {
+				skippedSet[skipped] = struct{}{}
+			}
+		}
+		for slot, reading := range sample.PSUSlots {
+			agg := slotAgg[slot]
+			if agg == nil {
+				agg = &slotAggregate{}
+				slotAgg[slot] = agg
+			}
+			if reading.InputW != nil && *reading.InputW > 0 {
+				agg.inputs = append(agg.inputs, *reading.InputW)
+			}
+			if reading.OutputW != nil && *reading.OutputW > 0 {
+				agg.outputs = append(agg.outputs, *reading.OutputW)
+			}
+			switch {
+			case reading.Status == "":
+			case agg.status == "":
+				agg.status = reading.Status
+			case agg.status == "OK" && reading.Status != "OK":
+				agg.status = reading.Status
+			}
+		}
+	}
+	summary.PSUInW = benchmarkMean(inputTotals)
+	summary.PSUOutW = benchmarkMean(outputTotals)
+	summary.GPUSlotW = benchmarkMean(gpuSlotTotals)
+	summary.Samples = len(samples)
+	if len(slotAgg) > 0 {
+		summary.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(slotAgg))
+		for slot, agg := range slotAgg {
+			reading := BenchmarkPSUSlotPower{Status: agg.status}
+			if mean := benchmarkMean(agg.inputs); mean > 0 {
+				v := mean
+				reading.InputW = &v
+			}
+			if mean := benchmarkMean(agg.outputs); mean > 0 {
+				v := mean
+				reading.OutputW = &v
+			}
+			summary.PSUSlots[slot] = reading
+		}
+	}
+	if len(skippedSet) > 0 {
+		summary.SkippedSensors = make([]string, 0, len(skippedSet))
+		for skipped := range skippedSet {
+			summary.SkippedSensors = append(summary.SkippedSensors, skipped)
+		}
+		sort.Strings(summary.SkippedSensors)
+	}
+	return summary
+}
+
+func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary {
+	if durationSec <= 0 {
+		return benchmarkSDRSeriesSummary{}
+	}
+	stopCh := make(chan struct{})
+	doneCh := startIPMISDRSampler(stopCh, intervalSec)
+	select {
+	case <-ctx.Done():
+	case <-time.After(time.Duration(durationSec) * time.Second):
+	}
+	close(stopCh)
+	return summarizeSDRPowerSeries(<-doneCh)
+}
+
 // queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
 // Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
 func queryIPMIServerPowerW() (float64, error) {
@@ -3086,8 +3234,9 @@ func runBenchmarkPowerCalibration(
 	logFunc func(string),
 	seedLimits map[int]int,
 	durationSec int,
-) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
+) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow, benchmarkPowerCalibrationRunSummary) {
 	calibDurationSec := durationSec
+	var runSummary benchmarkPowerCalibrationRunSummary
 	if calibDurationSec <= 0 {
 		calibDurationSec = 120
 	}
@@ -3105,12 +3254,12 @@ func runBenchmarkPowerCalibration(
 	if engine == BenchmarkPowerEngineTargetedPower {
 		if _, err := exec.LookPath("dcgmi"); err != nil {
 			logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
-			return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+			return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
 		}
 	} else {
 		if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
 			logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
-			return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+			return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
 		}
 	}
 	if killed := KillTestWorkers(); len(killed) > 0 {
@@ -3275,6 +3424,10 @@ calibDone:
 	}
 	attemptCtx, cancelAttempt := context.WithCancel(ctx)
 	doneCh := make(chan sharedAttemptResult, 1)
+	sdrStopCh := make(chan struct{})
+	sdrDoneCh := startIPMISDRSampler(sdrStopCh, benchmarkPowerAutotuneSampleInterval)
+	fanStopCh := make(chan struct{})
+	fanDoneCh := startBenchmarkFanSampler(fanStopCh, benchmarkPowerAutotuneSampleInterval)
 	go func() {
 		out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
 		doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
@@ -3314,6 +3467,10 @@ calibDone:
 	}
 	ticker.Stop()
 	cancelAttempt()
+	close(sdrStopCh)
+	close(fanStopCh)
+	attemptSDRSummary := summarizeSDRPowerSeries(<-sdrDoneCh)
+	attemptFanSummary := <-fanDoneCh
 	_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
 	// Accumulate telemetry rows with attempt stage label.
 	appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
@@ -3351,10 +3508,14 @@ calibDone:
 		busyDelaySec = 1
 
 	// Per-GPU analysis and binary search update.
+	attemptStable := ar.err == nil
 	for _, s := range active {
 		perGPU := filterRowsByGPU(ar.rows, s.idx)
 		summary := summarizeBenchmarkTelemetry(perGPU)
 		throttle := throttleReasons[s.idx]
+		if throttle != "" || summary.P95PowerW <= 0 {
+			attemptStable = false
+		}
 
 		// Cooling warning: thermal throttle with fans not at maximum.
 		if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
@@ -3487,6 +3648,16 @@ calibDone:
 			s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
 			logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
 		}
+		if attemptStable {
+			if attemptSDRSummary.Samples > 0 {
+				runSummary.LoadedSDR = attemptSDRSummary
+			}
+			if attemptFanSummary.FanSamples > 0 {
+				runSummary.AvgFanRPM = attemptFanSummary.AvgFanRPM
+				runSummary.AvgFanDutyCyclePct = attemptFanSummary.AvgFanDutyCyclePct
+				runSummary.FanSamples = attemptFanSummary.FanSamples
+			}
+		}
 	}
 
 	for _, s := range states {
@@ -3495,7 +3666,7 @@ calibDone:
 		}
 	}
 	writeBenchmarkMetricsFiles(runDir, allCalibRows)
-	return results, restore, allCalibRows
+	return results, restore, allCalibRows, runSummary
 }
 
 // isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3540,6 +3711,47 @@ func meanFanRPM(fans []FanReading) float64 {
 	return sum / float64(len(fans))
 }
 
+func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary {
+	if intervalSec <= 0 {
+		intervalSec = benchmarkPowerAutotuneSampleInterval
+	}
+	ch := make(chan benchmarkPowerCalibrationRunSummary, 1)
+	go func() {
+		defer close(ch)
+		var rpmSamples []float64
+		var dutySamples []float64
+		record := func() {
+			fans, err := sampleFanSpeeds()
+			if err != nil || len(fans) == 0 {
+				return
+			}
+			if rpm := meanFanRPM(fans); rpm > 0 {
+				rpmSamples = append(rpmSamples, rpm)
+			}
+			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok && duty > 0 {
+				dutySamples = append(dutySamples, duty)
+			}
+		}
+		record()
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- benchmarkPowerCalibrationRunSummary{
+					AvgFanRPM:          benchmarkMean(rpmSamples),
+					AvgFanDutyCyclePct: benchmarkMean(dutySamples),
+					FanSamples:         len(rpmSamples),
+				}
+				return
+			case <-ticker.C:
+				record()
+			}
+		}
+	}()
+	return ch
+}
+
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability:
@@ -3568,41 +3780,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW) fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
if sp := result.ServerPower; sp != nil && sp.Available { if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW) sourceLabel := "autotuned source"
if sp.PSUInputLoadedW > 0 { switch normalizeBenchmarkPowerSource(sp.Source) {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW case BenchmarkPowerSourceSDRPSUInput:
fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta) sourceLabel = "autotuned source (SDR PSU AC input)"
case BenchmarkPowerSourceDCMI:
sourceLabel = "autotuned source (DCMI)"
} }
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio) fmt.Fprintf(&b, "**Server power delta (%s):** %.0f W \n", sourceLabel, sp.DeltaW)
fmt.Fprintf(&b, "**Reporting ratio:** %.2f \n", sp.ReportingRatio)
} }
b.WriteString("\n") b.WriteString("\n")
// Server power comparison table. // Server power comparison table.
if sp := result.ServerPower; sp != nil { if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n") b.WriteString("## Server vs GPU Power Comparison\n\n")
selectedSource := normalizeBenchmarkPowerSource(sp.Source)
selectedSourceLabel := "Selected source"
if selectedSource == BenchmarkPowerSourceSDRPSUInput {
selectedSourceLabel = "Selected source (SDR PSU AC input)"
} else if selectedSource == BenchmarkPowerSourceDCMI {
selectedSourceLabel = "Selected source (DCMI)"
}
var spRows [][]string var spRows [][]string
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)}) spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
if sp.GPUSlotTotalW > 0 {
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
}
if sp.Available { if sp.Available {
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)}) spRows = append(spRows, []string{selectedSourceLabel + " idle power", fmt.Sprintf("%.0f W", sp.IdleW)})
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)}) spRows = append(spRows, []string{selectedSourceLabel + " loaded power", fmt.Sprintf("%.0f W", sp.LoadedW)})
spRows = append(spRows, []string{"Server Δ power (loaded idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)}) spRows = append(spRows, []string{selectedSourceLabel + " Δ power (loaded idle)", fmt.Sprintf("%.0f W", sp.DeltaW)})
} }
if sp.PSUInputLoadedW > 0 { if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 {
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)}) spRows = append(spRows, []string{"PSU AC input (idle avg, pre-load phase)", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)}) spRows = append(spRows, []string{"PSU AC input (loaded avg, final phase)", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
spRows = append(spRows, []string{"PSU AC input Δ (loaded idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)}) spRows = append(spRows, []string{"PSU AC input Δ (loaded idle)", fmt.Sprintf("%.0f W", psuDelta)})
}
if sp.PSUOutputLoadedW > 0 {
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
}
} }
if sp.Available { if sp.Available {
ratio := sp.ReportingRatio ratio := sp.ReportingRatio
@@ -3619,8 +3829,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			default:
 				ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
 			}
-			spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
-			if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
+			spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
+			if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
 				psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
 				sdrRatio := psuDelta / sp.GPUReportedSumW
 				sdrNote := ""
@@ -3632,12 +3842,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				default:
 					sdrNote = "✗ significant discrepancy"
 				}
-				spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
+				spRows = append(spRows, []string{"PSU AC input reporting ratio", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
 			}
 		} else {
-			spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
+			spRows = append(spRows, []string{"IPMI availability", "not available — IPMI not supported or ipmitool not found"})
 		}
-		b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
+		b.WriteString(fmtMDTable([]string{"Metric", "Value"}, spRows))
 		for _, note := range sp.Notes {
 			fmt.Fprintf(&b, "\n> %s\n", note)
 		}
@@ -3689,11 +3899,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			psuDistRows = append(psuDistRows, []string{
 				slot,
 				fmtW(idle.InputW), fmtW(loaded.InputW),
-				fmtW(idle.OutputW), fmtW(loaded.OutputW),
 				deltaStr, status,
 			})
 		}
-		b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
+		b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle avg)", "AC Input (loaded avg)", "Load Δ", "Status"}, psuDistRows))
 		b.WriteString("\n")
 	}
 }
@@ -3741,7 +3950,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				fan,
 			})
 		}
-		b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
+		b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Avg Fan RPM (duty%)"}, sgRows))
 		b.WriteString("\n")
 	}
 	if len(result.RecommendedSlotOrder) > 0 {
@@ -3850,7 +4059,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 		for _, slot := range psuSlots {
 			psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
 		}
-		psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
+		psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Avg Fan RPM (duty%)")
 		var psuRows [][]string
 		for _, step := range result.RampSteps {
@@ -3931,7 +4140,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 		}
 		pdRows = append(pdRows, []string{
 			fmt.Sprintf("GPU %d", gpu.Index),
-			fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
 			fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
 			fmt.Sprintf("%.0f W", stable),
 			realization,
@@ -3944,13 +4152,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	}
 	pdRows = append(pdRows, []string{
 		"**Platform**",
-		fmt.Sprintf("**%.0f W**", totalDefault),
 		"—",
 		fmt.Sprintf("**%.0f W**", totalStable),
 		fmt.Sprintf("**%s**", platformReal),
 		"",
 	})
-	b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
+	b.WriteString(fmtMDTable([]string{"GPU", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
 	b.WriteString("\n")
 
 	// Balance across GPUs — only meaningful with 2+ GPUs.
@@ -4100,7 +4307,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
{"Avg Temp °C", singleTemp}, {"Avg Temp °C", singleTemp},
{"Power W", singlePwr}, {"Power W", singlePwr},
{"Per GPU wall W", singleWall}, {"Per GPU wall W", singleWall},
{"Fan RPM (duty%)", singleFan}, {"Avg Fan RPM (duty%)", singleFan},
} }
if lastStep != nil { if lastStep != nil {
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem)) compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
@@ -4208,18 +4415,22 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// Sample server idle power before any GPU load.
 	var serverIdleW float64
 	var serverIdleOK bool
+	idleSDRStopCh := make(chan struct{})
+	idleSDRCh := startIPMISDRSampler(idleSDRStopCh, benchmarkPowerAutotuneSampleInterval)
 	if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
 		serverIdleW = w
 		serverIdleOK = true
 		logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
 	}
-	sdrIdle := sampleIPMISDRPowerSensors()
+	close(idleSDRStopCh)
+	sdrIdle := summarizeSDRPowerSeries(<-idleSDRCh)
 	psuBefore := psuStatusSnapshot()
 
 	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
 	singleIPMILoadedW := make(map[int]float64, len(selected))
+	singleRunSummaryByIndex := make(map[int]benchmarkPowerCalibrationRunSummary, len(selected))
 	var allRestoreActions []benchmarkRestoreAction
 	// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
 	var allPowerRows []GPUMetricRow
@@ -4235,21 +4446,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
singlePowerStopCh := make(chan struct{}) singlePowerStopCh := make(chan struct{})
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval) singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec) c, restore, singleRows, singleRun := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0) appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
close(singlePowerStopCh) close(singlePowerStopCh)
sdrSingle := sampleIPMISDRPowerSensors()
if samples := <-singlePowerCh; len(samples) > 0 { if samples := <-singlePowerCh; len(samples) > 0 {
singleIPMILoadedW[idx] = benchmarkMean(samples) singleIPMILoadedW[idx] = benchmarkMean(samples)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx])) logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 { } else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun.LoadedSDR.PSUInW > 0 {
singleIPMILoadedW[idx] = sdrSingle.PSUInW singleIPMILoadedW[idx] = singleRun.LoadedSDR.PSUInW
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW)) logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR avg): %.0f W", idx, singleRun.LoadedSDR.PSUInW))
} }
allRestoreActions = append(allRestoreActions, restore...) allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok { if r, ok := c[idx]; ok {
calibByIndex[idx] = r calibByIndex[idx] = r
} }
singleRunSummaryByIndex[idx] = singleRun
} }
defer func() { defer func() {
for i := len(allRestoreActions) - 1; i >= 0; i-- { for i := len(allRestoreActions) - 1; i >= 0; i-- {
@@ -4292,11 +4503,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			t := summarizeBenchmarkTelemetry(calib.MetricRows)
 			gpu.Telemetry = &t
 		}
-		if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
-			gpu.AvgFanRPM = meanFanRPM(fans)
-			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
-				gpu.AvgFanDutyCyclePct = duty
-			}
+		if singleRun := singleRunSummaryByIndex[idx]; singleRun.AvgFanRPM > 0 {
+			gpu.AvgFanRPM = singleRun.AvgFanRPM
+			gpu.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
 		}
 		gpus = append(gpus, gpu)
 	}
@@ -4352,10 +4561,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// per-step in NvidiaPowerBenchStep.ServerLoadedW.
 	var serverLoadedW float64
 	var serverLoadedOK bool
-	// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
-	// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
-	// after the test when GPUs have already returned to idle.
-	var sdrLastStep sdrPowerSnapshot
+	// sdrLastStep retains the phase-averaged SDR readings from the last ramp step
+	// while GPUs are loaded. Used in the summary instead of re-sampling after the
+	// test when GPUs have already returned to idle.
+	var sdrLastStep benchmarkSDRSeriesSummary
 
 	// Step 1: reuse single-card calibration result directly.
 	if len(result.RecommendedSlotOrder) > 0 {
@@ -4376,6 +4585,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			ramp.ServerLoadedW = w
 			ramp.ServerDeltaW = w - serverIdleW
 		}
+		if singleRun := singleRunSummaryByIndex[firstIdx]; singleRun.AvgFanRPM > 0 {
+			ramp.AvgFanRPM = singleRun.AvgFanRPM
+			ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
+		}
 		if !firstCalib.Completed {
 			ramp.Status = "FAILED"
 			ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
@@ -4426,7 +4639,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		stepPowerStopCh := make(chan struct{})
 		stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
-		stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
+		stepCalib, stepRestore, stepRows, stepRun := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
 		appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
 		close(stepPowerStopCh)
 		var stepIPMILoadedW float64
@@ -4497,10 +4710,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
 		}
-		// Per-step PSU slot snapshot — also used as the authoritative loaded power
-		// source when SDR PSU sensors are available (more accurate than DCMI on
-		// servers where DCMI covers only a subset of installed PSUs).
-		sdrStep := sampleIPMISDRPowerSensors()
+		// Per-step PSU slot readings are averaged over the whole load phase rather
+		// than captured as a single end-of-phase snapshot.
+		sdrStep := stepRun.LoadedSDR
 		if len(sdrStep.PSUSlots) > 0 {
 			ramp.PSUSlotReadings = sdrStep.PSUSlots
 		}
@@ -4518,7 +4730,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
 			ramp.ServerLoadedW = sdrStep.PSUInW
 			ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
-			logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
+			logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR avg): %.0f W", step, sdrStep.PSUInW))
 			if step == len(result.RecommendedSlotOrder) {
 				serverLoadedW = sdrStep.PSUInW
 				serverLoadedOK = true
@@ -4526,12 +4738,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			}
 		}
-		// Fan state at end of ramp step.
-		if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
-			ramp.AvgFanRPM = meanFanRPM(fans)
-			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
-				ramp.AvgFanDutyCyclePct = duty
-			}
+		// Fan values are phase averages over the same load window.
+		if stepRun.AvgFanRPM > 0 {
+			ramp.AvgFanRPM = stepRun.AvgFanRPM
+			ramp.AvgFanDutyCyclePct = stepRun.AvgFanDutyCyclePct
 		}
 
 		// Per-GPU telemetry from this ramp step's calibration.
@@ -4584,8 +4794,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
 	// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
 	if result.ServerPower != nil {
-		// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
-		// than re-sampling here, which would capture post-test idle state.
+		// Use the SDR phase average from the last ramp step (GPUs still loaded)
+		// rather than re-sampling here, which would capture post-test idle state.
 		sdrLoaded := sdrLastStep
 		result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
 		result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
@@ -4605,6 +4815,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			result.ServerPower.Notes = append(result.ServerPower.Notes,
 				"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
 		}
+		if sdrLoaded.Samples > 0 {
+			result.ServerPower.Notes = append(result.ServerPower.Notes,
+				fmt.Sprintf("Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step.", sdrLoaded.Samples))
+		}
 		// Detect DCMI partial coverage: direct SDR comparison first,
 		// ramp heuristic as fallback when SDR PSU sensors are absent.
 		dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||

sat.go

View File

@@ -30,10 +30,10 @@ import (
 // Sources:
 // - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
 // - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
-// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
-// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
-// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
-// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
+// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
 // - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
 // - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
 // - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
@@ -48,15 +48,15 @@ const (
 	// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
 	SATEstimatedMemoryStressSec = 140
 
-	// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
-	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
-	// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
-	SATEstimatedNvidiaGPUStressPerGPUSec = 450
-	// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
-	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
-	// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
-	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
+	// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUValidateSec = 85
+	// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUStressSec = 450
+	// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedStressSec = 350
+	// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedPowerSec = 350
 	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
 	SATEstimatedNvidiaPulseTestSec = 5000

page_validate.go

View File

@@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int {
 	}
 	total := platform.SATEstimatedCPUValidateSec +
 		platform.SATEstimatedMemoryValidateSec +
-		n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
+	if n > 0 {
+		total += platform.SATEstimatedNvidiaGPUValidateSec
+	}
 	return total
 }
@@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int {
 	}
 	total := platform.SATEstimatedCPUStressSec +
 		platform.SATEstimatedMemoryStressSec +
-		n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
-		n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
-		n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
 		platform.SATEstimatedNvidiaPulseTestSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
+	if n > 0 {
+		total += platform.SATEstimatedNvidiaGPUStressSec +
+			platform.SATEstimatedNvidiaTargetedStressSec +
+			platform.SATEstimatedNvidiaTargetedPowerSec
+	}
 	return total
 }
@@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Runs NVIDIA diagnostics and board inventory checks.`,
 			`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
-			func() string {
-				perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
-				perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
-				if n > 0 {
-					return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
-						validateFmtDur(perV), n, validateFmtDur(perV*n),
-						validateFmtDur(perS), n, validateFmtDur(perS*n))
-				}
-				return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
-					validateFmtDur(perV), validateFmtDur(perS))
-			}(),
+			fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
+				validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
+				validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
 		)) +
 		`<div id="sat-card-nvidia-targeted-stress">` +
 		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
 			inv.NVIDIA,
 			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
 			`<code>dcgmi diag targeted_stress</code>`,
-			func() string {
-				per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
-				s := "Skipped in Validate. "
-				if n > 0 {
-					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
-				} else {
-					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
-				}
-				return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
-			}(),
+			"Skipped in Validate. Stress: "+validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-targeted-power">` +
@@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
 			`<code>dcgmi diag targeted_power</code>`,
-			func() string {
-				per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
-				s := "Skipped in Validate. "
-				if n > 0 {
-					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
-				} else {
-					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
-				}
-				return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
-			}(),
+			"Skipped in Validate. Stress: "+validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-pulse">` +
@@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) {
   return enqueueSATTarget(target, overrides)
     .then(d => streamSATTask(d.task_id, title, false));
 }
-const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
-const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+const nvidiaPerGPUTargets = [];
+const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
 function satAllGPUIndicesForMulti() {
   return Promise.resolve(satSelectedGPUIndices());
 }
@@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) {
   });
 }
 function runNvidiaValidateSet(target) {
-  return loadSatNvidiaGPUs().then(gpus => {
-    const selected = satSelectedGPUIndices();
-    const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
-    if (!picked.length) {
-      throw new Error('Select at least one NVIDIA GPU.');
-    }
-    if (picked.length === 1) {
-      const gpu = picked[0];
-      return runSATWithOverrides(target, {
-        gpu_indices: [Number(gpu.index)],
-        display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
-      });
-    }
-    document.getElementById('sat-output').style.display='block';
-    document.getElementById('sat-title').textContent = '— ' + target;
-    const term = document.getElementById('sat-terminal');
-    term.textContent = 'Running ' + target + ' one GPU at a time...\n';
-    const labelBase = satLabels()[target] || ('Validate ' + target);
-    const runNext = (idx) => {
-      if (idx >= picked.length) return Promise.resolve();
-      const gpu = picked[idx];
-      const gpuLabel = satGPUDisplayName(gpu);
-      term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
-      return enqueueSATTarget(target, {
-        gpu_indices: [Number(gpu.index)],
-        display_name: labelBase + ' (' + gpuLabel + ')',
-      }).then(d => {
-        return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
-      }).then(function() {
-        return runNext(idx + 1);
-      });
-    };
-    return runNext(0);
-  });
+  const selected = satSelectedGPUIndices();
+  if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
+  return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
 }
 function runAMDValidateSet() {
   const targets = selectedAMDValidateTargets();

View File

@@ -126,6 +126,37 @@ resolve_iso_version() {
 	resolve_audit_version
 }
 
+sync_builder_workdir() {
+	src_dir="$1"
+	dst_dir="$2"
+	mkdir -p "$dst_dir"
+	# Historical bug: old workdirs could keep config/bootloaders/grub-pc even
+	# after the source tree moved to grub-efi only. Remove bootloaders eagerly
+	# so reused workdirs cannot leak stale templates into a new ISO build.
+	rm -rf "$dst_dir/config/bootloaders"
+	rsync -a --delete \
+		--exclude='cache/' \
+		--exclude='chroot/' \
+		--exclude='.build/' \
+		--exclude='*.iso' \
+		--exclude='*.packages' \
+		--exclude='*.contents' \
+		--exclude='*.files' \
+		"$src_dir/" "$dst_dir/"
+	if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
+		echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
+		exit 1
+	fi
+	if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
+		echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
+		exit 1
+	fi
+}
+
 iso_list_files() {
 	iso_path="$1"
@@ -466,6 +497,75 @@ validate_iso_memtest() {
echo "=== memtest validation OK ===" echo "=== memtest validation OK ==="
} }
validate_iso_live_boot_entries() {
iso_path="$1"
echo "=== validating live boot entries in ISO ==="
[ -f "$iso_path" ] || {
echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
exit 1
}
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
echo "ERROR: ISO reader unavailable for live boot validation" >&2
exit 1
}
grub_cfg="$(mktemp)"
isolinux_cfg="$(mktemp)"
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
fi
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
echo "ERROR: GRUB toram entry is missing" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'linux .*boot=live ' "$grub_cfg" || {
echo "ERROR: GRUB live entry is missing boot=live" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
echo "ERROR: isolinux live entry is missing boot=live" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
rm -f "$grub_cfg" "$isolinux_cfg"
echo "=== live boot validation OK ==="
}
validate_iso_nvidia_runtime() { validate_iso_nvidia_runtime() {
iso_path="$1" iso_path="$1"
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0 [ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
@@ -558,6 +658,21 @@ extract_live_grub_entry() {
 	return 0
 }
 
+load_live_build_append() {
+	lb_dir="$1"
+	binary_cfg="$lb_dir/config/binary"
+	[ -f "$binary_cfg" ] || return 1
+	# config/binary is generated by live-build and contains shell variable
+	# assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
+	# shellcheck disable=SC1090
+	. "$binary_cfg"
+	[ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
+	live_build_append="$LB_BOOTAPPEND_LIVE"
+	return 0
+}
+
 extract_live_isolinux_entry() {
 	cfg="$1"
 	isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
@@ -594,36 +709,15 @@ echo " Hardware Audit LiveCD"
echo "" echo ""
menuentry "EASY-BEE" { menuentry "EASY-BEE" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd} initrd ${initrd}
} }
menuentry "EASY-BEE load to RAM (toram)" { menuentry "EASY-BEE -- load to RAM (toram)" {
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd} initrd ${initrd}
} }
submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — GSP=off" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE — KMS (no nomodeset)" {
linux ${kernel} ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE — KMS + GSP=off" {
linux ${kernel} ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE — fail-safe" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd ${initrd}
}
}
if [ "\${grub_platform}" = "efi" ]; then if [ "\${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" { menuentry "Memory Test (memtest86+)" {
@@ -699,13 +793,18 @@ enforce_live_build_bootloader_assets() {
 	grub_dir="$lb_dir/binary/boot/grub"
 	isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
 
+	if ! load_live_build_append "$lb_dir"; then
+		echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
+		live_build_append=""
+	fi
+
 	if [ -f "$grub_cfg" ]; then
 		if extract_live_grub_entry "$grub_cfg"; then
 			mkdir -p "$grub_dir/live-theme"
 			cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
 			cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
 			cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
-			write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "$grub_append" "$grub_initrd"
+			write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
 			echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
 		else
 			echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
@@ -714,7 +813,7 @@ enforce_live_build_bootloader_assets() {
if [ -f "$isolinux_cfg" ]; then if [ -f "$isolinux_cfg" ]; then
if extract_live_isolinux_entry "$isolinux_cfg"; then if extract_live_isolinux_entry "$isolinux_cfg"; then
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "$isolinux_append" write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu" echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
else else
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2 echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
@@ -1112,15 +1211,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
 mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
 # Sync builder config into variant work dir, preserving lb cache.
-rsync -a --delete \
-    --exclude='cache/' \
-    --exclude='chroot/' \
-    --exclude='.build/' \
-    --exclude='*.iso' \
-    --exclude='*.packages' \
-    --exclude='*.contents' \
-    --exclude='*.files' \
-    "${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
+sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
 # Share deb package cache across variants.
 # Restore: populate work dir cache from shared cache before build.
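
sync_builder_workdir presumably centralises the inline rsync removed above so every call site shares one exclude list (the "clear stale bootloader templates" commit in this range may add more to it). A sketch assuming it simply wraps the old invocation:

# Sketch -- body assumed identical to the inline rsync it replaces.
sync_builder_workdir() {
    src="$1"
    dst="$2"
    # Mirror builder config into the variant work dir while preserving
    # the lb cache/chroot/.build state and skipping build products.
    rsync -a --delete \
        --exclude='cache/' \
        --exclude='chroot/' \
        --exclude='.build/' \
        --exclude='*.iso' \
        --exclude='*.packages' \
        --exclude='*.contents' \
        --exclude='*.files' \
        "${src}/" "${dst}/"
}
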
@@ -1411,8 +1502,11 @@ dump_memtest_debug "pre-build" "${LB_DIR}"
 run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
 echo "=== enforcing canonical bootloader assets ==="
 enforce_live_build_bootloader_assets "${LB_DIR}"
+reset_live_build_stage "${LB_DIR}" "binary_checksums"
+reset_live_build_stage "${LB_DIR}" "binary_iso"
+reset_live_build_stage "${LB_DIR}" "binary_zsync"
 run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
-run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "rm -f '${LB_DIR}/live-image-amd64.hybrid.iso' && lb binary_iso 2>&1"
+run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
 run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
 # --- persist deb package cache back to shared location ---
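
reset_live_build_stage is defined elsewhere in the script; live-build skips a stage whenever its stamp file already exists under .build/, so clearing the stamps is what lets the binary_checksums/binary_iso/binary_zsync stages run a second time after the bootloader sync (and why the explicit rm -f of the old ISO is no longer needed). A plausible shape, assuming live-build's standard .build/<stage> stamp convention:

# Sketch -- assumes one stamp file per stage under .build/.
reset_live_build_stage() {
    lb_dir="$1"
    stage="$2"
    rm -f "$lb_dir/.build/$stage"
}
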
@@ -1438,6 +1532,7 @@ if [ -f "$ISO_RAW" ]; then
 fi
 fi
 validate_iso_memtest "$ISO_RAW"
+validate_iso_live_boot_entries "$ISO_RAW"
 validate_iso_nvidia_runtime "$ISO_RAW"
 cp "$ISO_RAW" "$ISO_OUT"
 echo ""


@@ -23,9 +23,9 @@ insmod serial
 serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
 insmod gfxterm
-insmod png
-source /boot/grub/theme.cfg
 terminal_input console serial
 terminal_output gfxterm serial
+insmod png
+source /boot/grub/theme.cfg


@@ -1,47 +1,16 @@
 source /boot/grub/config.cfg
-echo ""
-echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
-echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
-echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
-echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
-echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
-echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
-echo " Hardware Audit LiveCD"
-echo ""
 menuentry "EASY-BEE" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
 initrd @INITRD_LIVE@
 }
-submenu "EASY-BEE (advanced options) -->" {
-menuentry "EASY-BEE — load to RAM (toram)" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-initrd @INITRD_LIVE@
-}
-menuentry "EASY-BEE — GSP=off" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-initrd @INITRD_LIVE@
-}
-menuentry "EASY-BEE — KMS (no nomodeset)" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-initrd @INITRD_LIVE@
-}
-menuentry "EASY-BEE — KMS + GSP=off" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-initrd @INITRD_LIVE@
-}
-menuentry "EASY-BEE — fail-safe" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
-initrd @INITRD_LIVE@
-}
+menuentry "EASY-BEE -- load to RAM (toram)" {
+linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+initrd @INITRD_LIVE@
 }
 if [ "${grub_platform}" = "efi" ]; then
 menuentry "Memory Test (memtest86+)" {
 chainloader /boot/memtest86+x64.efi

bee-logo.png: binary file changed, not shown (70 KiB before, 78 KiB after)


@@ -5,12 +5,10 @@ title-text: ""
 message-font: "Unifont Regular 16"
 terminal-font: "Unifont Regular 16"
-#bee logo centered, upper third of screen
+#bee logo - centered, upper third of screen
 + image {
 top = 4%
 left = 50%-200
-width = 400
-height = 400
 file = "bee-logo.png"
 }
@@ -36,11 +34,11 @@ terminal-font: "Unifont Regular 16"
 item_font = "Unifont Regular 16"
 selected_item_color= "#f5a800"
 selected_item_font = "Unifont Regular 16"
-item_height = 16
-item_padding = 0
+item_height = 20
+item_padding = 2
 item_spacing = 4
 icon_width = 0
-icon_heigh = 0
+icon_height = 0
 item_icon_space = 0
 }