Compare commits


15 Commits
v8.32 ... v8.39

Author SHA1 Message Date
6112094d45 fix(grub): fix bitmap error and menu rendering
- Convert bee-logo.png to RGBA (color type 6) and strip all metadata
  chunks (cHRM, bKGD, tIME, tEXt) that confuse GRUB's minimal PNG parser
- Move terminal_output gfxterm before insmod png / theme load so the
  theme initialises in an active gfxterm context (see the sketch after
  this list)
- Remove echo ASCII art banner from grub.cfg — with gfxterm active and
  no terminal_box in the theme, echo output renders over the menu area
- Fix icon_heigh typo → icon_height; increase item_height 16→20 with
  item_padding 0→2 for reliable text rendering in boot_menu
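
A minimal sketch of the resulting command order in grub.cfg, assuming the stock GRUB commands (loadfont, insmod, terminal_output); the font and theme paths here are illustrative, not necessarily the repo's actual names:

    # Activate gfxterm first so the theme initialises in a live graphical context.
    loadfont /boot/grub/fonts/unicode.pf2
    insmod all_video
    insmod gfxterm
    terminal_output gfxterm

    # Only now pull in the PNG decoder and load the theme.
    insmod png
    set theme=/boot/grub/live-theme/theme.txt
    export theme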

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 22:05:16 +03:00
e9a2bc9f9d update submodule 2026-04-22 20:39:27 +03:00
Mikhail Chusavitin
7a8f884664 fix(boot): remove advanced options submenu
Keep only EASY-BEE and toram entries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 19:01:50 +03:00
Mikhail Chusavitin
8bf8dfa45b fix(boot): default to KMS + pci=realloc, drop nomodeset from main entries
Default and toram entries now boot with bee.display=kms (ASPEED AST
loads via KMS, Xorg uses modesetting driver) and pci=realloc (Linux
reassigns GPU BARs when BIOS lacks Above 4G Decoding). nomodeset
removed from these entries; still present in GSP=off and fail-safe.
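
For reference, the default entry after this change, as it appears in the canonical grub.cfg template later in this diff (${kernel}, ${append_live}, and ${initrd} are substituted at build time):

    menuentry "EASY-BEE" {
        linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd ${initrd}
    }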

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 19:00:04 +03:00
Mikhail Chusavitin
6a22199aff chore(bible): bump ascii-safe-text contract
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 18:52:10 +03:00
Mikhail Chusavitin
ddb2bb5d1c fix(grub): replace em-dash with ASCII -- in all menu entry titles
Em-dash (U+2014) renders as garbage on GRUB serial/SOL output
(IPMI BMC consoles). Replace with ASCII double-hyphen throughout
grub.cfg template, write_canonical_grub_cfg, and theme.txt comment.
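
A minimal Go sketch of the substitution rule; the helper name is hypothetical (the actual fix lives in the grub.cfg template and write_canonical_grub_cfg, not in a Go helper):

    package main

    import (
    	"fmt"
    	"strings"
    )

    // asciiSafeTitle is an illustrative helper for the ascii-safe-text
    // contract: U+2014 EM DASH becomes the ASCII double-hyphen so IPMI
    // SOL consoles render menu titles correctly.
    func asciiSafeTitle(s string) string {
    	return strings.ReplaceAll(s, "\u2014", "--")
    }

    func main() {
    	fmt.Println(asciiSafeTitle("EASY-BEE \u2014 load to RAM (toram)"))
    	// Output: EASY-BEE -- load to RAM (toram)
    }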

Also align template grub.cfg structure with write_canonical_grub_cfg:
toram entry moved to top level (was inside submenu).

bible: add ascii-safe-text contract documenting the no-em-dash rule.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 18:52:04 +03:00
Mikhail Chusavitin
aa284ae754 fix(iso): avoid grub logo scaling error 2026-04-20 14:06:32 +03:00
Mikhail Chusavitin
8512098174 fix(iso): restore bootappend-live in canonical boot menu 2026-04-20 13:39:05 +03:00
Mikhail Chusavitin
6b5d22c194 chore(git): ignore local audit binary 2026-04-20 13:21:35 +03:00
Mikhail Chusavitin
a35e90a93e fix(iso): clear stale bootloader templates in workdir 2026-04-20 13:19:50 +03:00
Mikhail Chusavitin
1ced81707f fix(iso): validate live boot entries in final ISO 2026-04-20 13:12:24 +03:00
Mikhail Chusavitin
679aeb9947 Run NVIDIA DCGM diag tests on all selected GPUs simultaneously
targeted_stress, targeted_power, and the Level 2/3 diag were dispatched
one GPU at a time from the UI, turning a single dcgmi command into 8
sequential ~350–450 s runs. DCGM supports -i with a comma-separated list
of GPU indices and runs the diagnostic on all of them in parallel.
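
A sketch of the dispatch change in Go; the helper name is illustrative, while `dcgmi diag -r <level> -i <comma-separated GPU list>` is the documented DCGM interface:

    package main

    import (
    	"fmt"
    	"os/exec"
    	"strconv"
    	"strings"
    )

    // runDiagAllGPUs builds one dcgmi invocation covering every selected
    // GPU at once, instead of one process per GPU (helper name is
    // hypothetical, for illustration only).
    func runDiagAllGPUs(level int, gpuIndices []int) *exec.Cmd {
    	ids := make([]string, len(gpuIndices))
    	for i, idx := range gpuIndices {
    		ids[i] = strconv.Itoa(idx)
    	}
    	// e.g. dcgmi diag -r 3 -i 0,1,2,3,4,5,6,7
    	return exec.Command("dcgmi", "diag", "-r", strconv.Itoa(level), "-i", strings.Join(ids, ","))
    }

    func main() {
    	cmd := runDiagAllGPUs(3, []int{0, 1, 2, 3, 4, 5, 6, 7})
    	fmt.Println(strings.Join(cmd.Args, " "))
    }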

Move nvidia, nvidia-targeted-stress, nvidia-targeted-power into
nvidiaAllGPUTargets so expandSATTarget passes all selected indices in one
API call. Simplify runNvidiaValidateSet to match runNvidiaFabricValidate.
Update sat.go constants and page_validate.go estimates to reflect all-GPU
simultaneous execution (remove n× multiplier from total time estimates).

Stress test on 8-GPU system: ~5.3 h → ~2.5 h.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 11:53:25 +03:00
Mikhail Chusavitin
647e99b697 Fix post-sync live-build ISO rebuild 2026-04-20 11:01:15 +03:00
Mikhail Chusavitin
4af997f436 Update audit bee binary 2026-04-20 10:55:42 +03:00
Mikhail Chusavitin
6caace0cc0 Make power benchmark report phase-averaged 2026-04-20 10:53:53 +03:00
10 changed files with 459 additions and 235 deletions

.gitignore vendored (1 change)
View File

@@ -3,3 +3,4 @@
 dist/
 iso/out/
 build-cache/
+audit/bee

BIN audit/bee

Binary file not shown.

View File

@@ -67,6 +67,13 @@ type benchmarkPowerCalibrationResult struct {
 	MetricRows []GPUMetricRow
 }
 
+type benchmarkPowerCalibrationRunSummary struct {
+	LoadedSDR          benchmarkSDRSeriesSummary
+	AvgFanRPM          float64
+	AvgFanDutyCyclePct float64
+	FanSamples         int
+}
+
 type benchmarkBurnProfile struct {
 	name     string
 	category string
@@ -2413,6 +2420,16 @@ type sdrPowerSnapshot struct {
 	SkippedSensors []string // sensors rejected during self-healing
 }
 
+type benchmarkSDRSeriesSummary struct {
+	PSUInW         float64
+	PSUOutW        float64
+	GPUSlotW       float64
+	PSUSlots       map[string]BenchmarkPSUSlotPower
+	Samples        int
+	SkippedSensors []string
+}
+
 // sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
 type sdrSensor struct {
 	name string
@@ -2542,6 +2559,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
 	return snap
 }
 
+func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot {
+	if intervalSec <= 0 {
+		intervalSec = benchmarkPowerAutotuneSampleInterval
+	}
+	ch := make(chan []sdrPowerSnapshot, 1)
+	go func() {
+		defer close(ch)
+		var samples []sdrPowerSnapshot
+		record := func() {
+			snap := sampleIPMISDRPowerSensors()
+			if snap.PSUInW <= 0 && snap.PSUOutW <= 0 && snap.GPUSlotW <= 0 && len(snap.PSUSlots) == 0 {
+				return
+			}
+			samples = append(samples, snap)
+		}
+		record()
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- samples
+				return
+			case <-ticker.C:
+				record()
+			}
+		}
+	}()
+	return ch
+}
+
+func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary {
+	var summary benchmarkSDRSeriesSummary
+	if len(samples) == 0 {
+		return summary
+	}
+	type slotAggregate struct {
+		inputs  []float64
+		outputs []float64
+		status  string
+	}
+	slotAgg := make(map[string]*slotAggregate)
+	skippedSet := make(map[string]struct{})
+	var inputTotals []float64
+	var outputTotals []float64
+	var gpuSlotTotals []float64
+	for _, sample := range samples {
+		if sample.PSUInW > 0 {
+			inputTotals = append(inputTotals, sample.PSUInW)
+		}
+		if sample.PSUOutW > 0 {
+			outputTotals = append(outputTotals, sample.PSUOutW)
+		}
+		if sample.GPUSlotW > 0 {
+			gpuSlotTotals = append(gpuSlotTotals, sample.GPUSlotW)
+		}
+		for _, skipped := range sample.SkippedSensors {
+			if skipped != "" {
+				skippedSet[skipped] = struct{}{}
+			}
+		}
+		for slot, reading := range sample.PSUSlots {
+			agg := slotAgg[slot]
+			if agg == nil {
+				agg = &slotAggregate{}
+				slotAgg[slot] = agg
+			}
+			if reading.InputW != nil && *reading.InputW > 0 {
+				agg.inputs = append(agg.inputs, *reading.InputW)
+			}
+			if reading.OutputW != nil && *reading.OutputW > 0 {
+				agg.outputs = append(agg.outputs, *reading.OutputW)
+			}
+			switch {
+			case reading.Status == "":
+			case agg.status == "":
+				agg.status = reading.Status
+			case agg.status == "OK" && reading.Status != "OK":
+				agg.status = reading.Status
+			}
+		}
+	}
+	summary.PSUInW = benchmarkMean(inputTotals)
+	summary.PSUOutW = benchmarkMean(outputTotals)
+	summary.GPUSlotW = benchmarkMean(gpuSlotTotals)
+	summary.Samples = len(samples)
+	if len(slotAgg) > 0 {
+		summary.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(slotAgg))
+		for slot, agg := range slotAgg {
+			reading := BenchmarkPSUSlotPower{Status: agg.status}
+			if mean := benchmarkMean(agg.inputs); mean > 0 {
+				v := mean
+				reading.InputW = &v
+			}
+			if mean := benchmarkMean(agg.outputs); mean > 0 {
+				v := mean
+				reading.OutputW = &v
+			}
+			summary.PSUSlots[slot] = reading
+		}
+	}
+	if len(skippedSet) > 0 {
+		summary.SkippedSensors = make([]string, 0, len(skippedSet))
+		for skipped := range skippedSet {
+			summary.SkippedSensors = append(summary.SkippedSensors, skipped)
+		}
+		sort.Strings(summary.SkippedSensors)
+	}
+	return summary
+}
+
+func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary {
+	if durationSec <= 0 {
+		return benchmarkSDRSeriesSummary{}
+	}
+	stopCh := make(chan struct{})
+	doneCh := startIPMISDRSampler(stopCh, intervalSec)
+	select {
+	case <-ctx.Done():
+	case <-time.After(time.Duration(durationSec) * time.Second):
+	}
+	close(stopCh)
+	return summarizeSDRPowerSeries(<-doneCh)
+}
+
 // queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
 // Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
 func queryIPMIServerPowerW() (float64, error) {
@@ -3086,8 +3234,9 @@ func runBenchmarkPowerCalibration(
 	logFunc func(string),
 	seedLimits map[int]int,
 	durationSec int,
-) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
+) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow, benchmarkPowerCalibrationRunSummary) {
 	calibDurationSec := durationSec
+	var runSummary benchmarkPowerCalibrationRunSummary
 	if calibDurationSec <= 0 {
 		calibDurationSec = 120
 	}
@@ -3105,12 +3254,12 @@ func runBenchmarkPowerCalibration(
 	if engine == BenchmarkPowerEngineTargetedPower {
 		if _, err := exec.LookPath("dcgmi"); err != nil {
 			logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
-			return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+			return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
 		}
 	} else {
 		if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
 			logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
-			return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+			return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
 		}
 	}
 	if killed := KillTestWorkers(); len(killed) > 0 {
@@ -3275,6 +3424,10 @@ calibDone:
 	}
 	attemptCtx, cancelAttempt := context.WithCancel(ctx)
 	doneCh := make(chan sharedAttemptResult, 1)
+	sdrStopCh := make(chan struct{})
+	sdrDoneCh := startIPMISDRSampler(sdrStopCh, benchmarkPowerAutotuneSampleInterval)
+	fanStopCh := make(chan struct{})
+	fanDoneCh := startBenchmarkFanSampler(fanStopCh, benchmarkPowerAutotuneSampleInterval)
 	go func() {
 		out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
 		doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
@@ -3314,6 +3467,10 @@ calibDone:
 	}
 	ticker.Stop()
 	cancelAttempt()
+	close(sdrStopCh)
+	close(fanStopCh)
+	attemptSDRSummary := summarizeSDRPowerSeries(<-sdrDoneCh)
+	attemptFanSummary := <-fanDoneCh
 	_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
 	// Accumulate telemetry rows with attempt stage label.
 	appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
@@ -3351,10 +3508,14 @@ calibDone:
 		busyDelaySec = 1
 
 	// Per-GPU analysis and binary search update.
+	attemptStable := ar.err == nil
 	for _, s := range active {
 		perGPU := filterRowsByGPU(ar.rows, s.idx)
 		summary := summarizeBenchmarkTelemetry(perGPU)
 		throttle := throttleReasons[s.idx]
+		if throttle != "" || summary.P95PowerW <= 0 {
+			attemptStable = false
+		}
 
 		// Cooling warning: thermal throttle with fans not at maximum.
 		if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
@@ -3487,6 +3648,16 @@ calibDone:
 			s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
 			logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
 		}
+		if attemptStable {
+			if attemptSDRSummary.Samples > 0 {
+				runSummary.LoadedSDR = attemptSDRSummary
+			}
+			if attemptFanSummary.FanSamples > 0 {
+				runSummary.AvgFanRPM = attemptFanSummary.AvgFanRPM
+				runSummary.AvgFanDutyCyclePct = attemptFanSummary.AvgFanDutyCyclePct
+				runSummary.FanSamples = attemptFanSummary.FanSamples
+			}
+		}
 	}
 
 	for _, s := range states {
@@ -3495,7 +3666,7 @@ calibDone:
 		}
 	}
 	writeBenchmarkMetricsFiles(runDir, allCalibRows)
-	return results, restore, allCalibRows
+	return results, restore, allCalibRows, runSummary
 }
 
 // isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3540,6 +3711,47 @@ func meanFanRPM(fans []FanReading) float64 {
 	return sum / float64(len(fans))
 }
 
+func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary {
+	if intervalSec <= 0 {
+		intervalSec = benchmarkPowerAutotuneSampleInterval
+	}
+	ch := make(chan benchmarkPowerCalibrationRunSummary, 1)
+	go func() {
+		defer close(ch)
+		var rpmSamples []float64
+		var dutySamples []float64
+		record := func() {
+			fans, err := sampleFanSpeeds()
+			if err != nil || len(fans) == 0 {
+				return
+			}
+			if rpm := meanFanRPM(fans); rpm > 0 {
+				rpmSamples = append(rpmSamples, rpm)
+			}
+			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok && duty > 0 {
+				dutySamples = append(dutySamples, duty)
+			}
+		}
+		record()
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- benchmarkPowerCalibrationRunSummary{
+					AvgFanRPM:          benchmarkMean(rpmSamples),
+					AvgFanDutyCyclePct: benchmarkMean(dutySamples),
+					FanSamples:         len(rpmSamples),
+				}
+				return
+			case <-ticker.C:
+				record()
+			}
+		}
+	}()
+	return ch
+}
+
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability:
@@ -3568,41 +3780,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW) fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
if sp := result.ServerPower; sp != nil && sp.Available { if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW) sourceLabel := "autotuned source"
if sp.PSUInputLoadedW > 0 { switch normalizeBenchmarkPowerSource(sp.Source) {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW case BenchmarkPowerSourceSDRPSUInput:
fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta) sourceLabel = "autotuned source (SDR PSU AC input)"
case BenchmarkPowerSourceDCMI:
sourceLabel = "autotuned source (DCMI)"
} }
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio) fmt.Fprintf(&b, "**Server power delta (%s):** %.0f W \n", sourceLabel, sp.DeltaW)
fmt.Fprintf(&b, "**Reporting ratio:** %.2f \n", sp.ReportingRatio)
} }
b.WriteString("\n") b.WriteString("\n")
// Server power comparison table. // Server power comparison table.
if sp := result.ServerPower; sp != nil { if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n") b.WriteString("## Server vs GPU Power Comparison\n\n")
selectedSource := normalizeBenchmarkPowerSource(sp.Source)
selectedSourceLabel := "Selected source"
if selectedSource == BenchmarkPowerSourceSDRPSUInput {
selectedSourceLabel = "Selected source (SDR PSU AC input)"
} else if selectedSource == BenchmarkPowerSourceDCMI {
selectedSourceLabel = "Selected source (DCMI)"
}
var spRows [][]string var spRows [][]string
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)}) spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
if sp.GPUSlotTotalW > 0 {
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
}
if sp.Available { if sp.Available {
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)}) spRows = append(spRows, []string{selectedSourceLabel + " idle power", fmt.Sprintf("%.0f W", sp.IdleW)})
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)}) spRows = append(spRows, []string{selectedSourceLabel + " loaded power", fmt.Sprintf("%.0f W", sp.LoadedW)})
spRows = append(spRows, []string{"Server Δ power (loaded idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)}) spRows = append(spRows, []string{selectedSourceLabel + " Δ power (loaded idle)", fmt.Sprintf("%.0f W", sp.DeltaW)})
} }
if sp.PSUInputLoadedW > 0 { if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 {
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)}) spRows = append(spRows, []string{"PSU AC input (idle avg, pre-load phase)", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)}) spRows = append(spRows, []string{"PSU AC input (loaded avg, final phase)", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
spRows = append(spRows, []string{"PSU AC input Δ (loaded idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)}) spRows = append(spRows, []string{"PSU AC input Δ (loaded idle)", fmt.Sprintf("%.0f W", psuDelta)})
}
if sp.PSUOutputLoadedW > 0 {
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
}
} }
if sp.Available { if sp.Available {
ratio := sp.ReportingRatio ratio := sp.ReportingRatio
@@ -3619,8 +3829,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			default:
 				ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
 			}
-			spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
-			if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
+			spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
+			if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
 				psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
 				sdrRatio := psuDelta / sp.GPUReportedSumW
 				sdrNote := ""
@@ -3632,12 +3842,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				default:
 					sdrNote = "✗ significant discrepancy"
 				}
-				spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
+				spRows = append(spRows, []string{"PSU AC input reporting ratio", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
 			}
 		} else {
-			spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
+			spRows = append(spRows, []string{"IPMI availability", "not available — IPMI not supported or ipmitool not found"})
 		}
-		b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
+		b.WriteString(fmtMDTable([]string{"Metric", "Value"}, spRows))
 		for _, note := range sp.Notes {
 			fmt.Fprintf(&b, "\n> %s\n", note)
 		}
@@ -3689,11 +3899,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			psuDistRows = append(psuDistRows, []string{
 				slot,
 				fmtW(idle.InputW), fmtW(loaded.InputW),
-				fmtW(idle.OutputW), fmtW(loaded.OutputW),
 				deltaStr, status,
 			})
 		}
-		b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
+		b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle avg)", "AC Input (loaded avg)", "Load Δ", "Status"}, psuDistRows))
 		b.WriteString("\n")
 	}
 }
@@ -3741,7 +3950,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				fan,
 			})
 		}
-		b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
+		b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Avg Fan RPM (duty%)"}, sgRows))
 		b.WriteString("\n")
 	}
 	if len(result.RecommendedSlotOrder) > 0 {
@@ -3850,7 +4059,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 		for _, slot := range psuSlots {
 			psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
 		}
-		psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
+		psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Avg Fan RPM (duty%)")
 		var psuRows [][]string
 		for _, step := range result.RampSteps {
@@ -3931,7 +4140,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 		}
 		pdRows = append(pdRows, []string{
 			fmt.Sprintf("GPU %d", gpu.Index),
-			fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
 			fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
 			fmt.Sprintf("%.0f W", stable),
 			realization,
@@ -3944,13 +4152,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	}
 	pdRows = append(pdRows, []string{
 		"**Platform**",
-		fmt.Sprintf("**%.0f W**", totalDefault),
 		"—",
 		fmt.Sprintf("**%.0f W**", totalStable),
 		fmt.Sprintf("**%s**", platformReal),
 		"",
 	})
-	b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
+	b.WriteString(fmtMDTable([]string{"GPU", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
 	b.WriteString("\n")
 
 	// Balance across GPUs — only meaningful with 2+ GPUs.
@@ -4100,7 +4307,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
{"Avg Temp °C", singleTemp}, {"Avg Temp °C", singleTemp},
{"Power W", singlePwr}, {"Power W", singlePwr},
{"Per GPU wall W", singleWall}, {"Per GPU wall W", singleWall},
{"Fan RPM (duty%)", singleFan}, {"Avg Fan RPM (duty%)", singleFan},
} }
if lastStep != nil { if lastStep != nil {
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem)) compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
@@ -4208,18 +4415,22 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// Sample server idle power before any GPU load.
 	var serverIdleW float64
 	var serverIdleOK bool
+	idleSDRStopCh := make(chan struct{})
+	idleSDRCh := startIPMISDRSampler(idleSDRStopCh, benchmarkPowerAutotuneSampleInterval)
 	if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
 		serverIdleW = w
 		serverIdleOK = true
 		logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
 	}
-	sdrIdle := sampleIPMISDRPowerSensors()
+	close(idleSDRStopCh)
+	sdrIdle := summarizeSDRPowerSeries(<-idleSDRCh)
 	psuBefore := psuStatusSnapshot()
 
 	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
 	singleIPMILoadedW := make(map[int]float64, len(selected))
+	singleRunSummaryByIndex := make(map[int]benchmarkPowerCalibrationRunSummary, len(selected))
 	var allRestoreActions []benchmarkRestoreAction
 	// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
 	var allPowerRows []GPUMetricRow
@@ -4235,21 +4446,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
singlePowerStopCh := make(chan struct{}) singlePowerStopCh := make(chan struct{})
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval) singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec) c, restore, singleRows, singleRun := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0) appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
close(singlePowerStopCh) close(singlePowerStopCh)
sdrSingle := sampleIPMISDRPowerSensors()
if samples := <-singlePowerCh; len(samples) > 0 { if samples := <-singlePowerCh; len(samples) > 0 {
singleIPMILoadedW[idx] = benchmarkMean(samples) singleIPMILoadedW[idx] = benchmarkMean(samples)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx])) logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 { } else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun.LoadedSDR.PSUInW > 0 {
singleIPMILoadedW[idx] = sdrSingle.PSUInW singleIPMILoadedW[idx] = singleRun.LoadedSDR.PSUInW
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW)) logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR avg): %.0f W", idx, singleRun.LoadedSDR.PSUInW))
} }
allRestoreActions = append(allRestoreActions, restore...) allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok { if r, ok := c[idx]; ok {
calibByIndex[idx] = r calibByIndex[idx] = r
} }
singleRunSummaryByIndex[idx] = singleRun
} }
defer func() { defer func() {
for i := len(allRestoreActions) - 1; i >= 0; i-- { for i := len(allRestoreActions) - 1; i >= 0; i-- {
@@ -4292,11 +4503,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			t := summarizeBenchmarkTelemetry(calib.MetricRows)
 			gpu.Telemetry = &t
 		}
-		if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
-			gpu.AvgFanRPM = meanFanRPM(fans)
-			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
-				gpu.AvgFanDutyCyclePct = duty
-			}
+		if singleRun := singleRunSummaryByIndex[idx]; singleRun.AvgFanRPM > 0 {
+			gpu.AvgFanRPM = singleRun.AvgFanRPM
+			gpu.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
 		}
 		gpus = append(gpus, gpu)
 	}
@@ -4352,10 +4561,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// per-step in NvidiaPowerBenchStep.ServerLoadedW.
 	var serverLoadedW float64
 	var serverLoadedOK bool
-	// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
-	// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
-	// after the test when GPUs have already returned to idle.
-	var sdrLastStep sdrPowerSnapshot
+	// sdrLastStep retains the phase-averaged SDR readings from the last ramp step
+	// while GPUs are loaded. Used in the summary instead of re-sampling after the
+	// test when GPUs have already returned to idle.
+	var sdrLastStep benchmarkSDRSeriesSummary
 
 	// Step 1: reuse single-card calibration result directly.
 	if len(result.RecommendedSlotOrder) > 0 {
@@ -4376,6 +4585,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			ramp.ServerLoadedW = w
 			ramp.ServerDeltaW = w - serverIdleW
 		}
+		if singleRun := singleRunSummaryByIndex[firstIdx]; singleRun.AvgFanRPM > 0 {
+			ramp.AvgFanRPM = singleRun.AvgFanRPM
+			ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
+		}
 		if !firstCalib.Completed {
 			ramp.Status = "FAILED"
 			ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
@@ -4426,7 +4639,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		stepPowerStopCh := make(chan struct{})
 		stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
-		stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
+		stepCalib, stepRestore, stepRows, stepRun := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
 		appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
 		close(stepPowerStopCh)
 		var stepIPMILoadedW float64
@@ -4497,10 +4710,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
 		}
-		// Per-step PSU slot snapshot — also used as the authoritative loaded power
-		// source when SDR PSU sensors are available (more accurate than DCMI on
-		// servers where DCMI covers only a subset of installed PSUs).
-		sdrStep := sampleIPMISDRPowerSensors()
+		// Per-step PSU slot readings are averaged over the whole load phase rather
+		// than captured as a single end-of-phase snapshot.
+		sdrStep := stepRun.LoadedSDR
 		if len(sdrStep.PSUSlots) > 0 {
 			ramp.PSUSlotReadings = sdrStep.PSUSlots
 		}
@@ -4518,7 +4730,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
 			ramp.ServerLoadedW = sdrStep.PSUInW
 			ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
-			logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
+			logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR avg): %.0f W", step, sdrStep.PSUInW))
 			if step == len(result.RecommendedSlotOrder) {
 				serverLoadedW = sdrStep.PSUInW
 				serverLoadedOK = true
@@ -4526,12 +4738,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			}
 		}
-		// Fan state at end of ramp step.
-		if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
-			ramp.AvgFanRPM = meanFanRPM(fans)
-			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
-				ramp.AvgFanDutyCyclePct = duty
-			}
+		// Fan values are phase averages over the same load window.
+		if stepRun.AvgFanRPM > 0 {
+			ramp.AvgFanRPM = stepRun.AvgFanRPM
+			ramp.AvgFanDutyCyclePct = stepRun.AvgFanDutyCyclePct
 		}
 
 		// Per-GPU telemetry from this ramp step's calibration.
@@ -4584,8 +4794,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
 	// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
 	if result.ServerPower != nil {
-		// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
-		// than re-sampling here, which would capture post-test idle state.
+		// Use the SDR phase average from the last ramp step (GPUs still loaded)
+		// rather than re-sampling here, which would capture post-test idle state.
 		sdrLoaded := sdrLastStep
 		result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
 		result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
@@ -4605,6 +4815,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			result.ServerPower.Notes = append(result.ServerPower.Notes,
 				"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
 		}
+		if sdrLoaded.Samples > 0 {
+			result.ServerPower.Notes = append(result.ServerPower.Notes,
+				fmt.Sprintf("Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step.", sdrLoaded.Samples))
+		}
 		// Detect DCMI partial coverage: direct SDR comparison first,
 		// ramp heuristic as fallback when SDR PSU sensors are absent.
 		dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||

sat.go

View File

@@ -30,10 +30,10 @@ import (
 // Sources:
 // - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
 // - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
-// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
-// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
-// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
-// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
+// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
 // - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
 // - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
 // - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
@@ -48,15 +48,15 @@ const (
 	// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
 	SATEstimatedMemoryStressSec = 140
 
-	// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
-	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
-	// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
-	SATEstimatedNvidiaGPUStressPerGPUSec = 450
-	// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
-	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
-	// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
-	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
+	// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUValidateSec = 85
+	// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUStressSec = 450
+	// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedStressSec = 350
+	// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedPowerSec = 350
 	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
 	SATEstimatedNvidiaPulseTestSec = 5000

page_validate.go

View File

@@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int {
 	}
 	total := platform.SATEstimatedCPUValidateSec +
 		platform.SATEstimatedMemoryValidateSec +
-		n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
+	if n > 0 {
+		total += platform.SATEstimatedNvidiaGPUValidateSec
+	}
 	return total
 }
@@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int {
 	}
 	total := platform.SATEstimatedCPUStressSec +
 		platform.SATEstimatedMemoryStressSec +
-		n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
-		n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
-		n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
 		platform.SATEstimatedNvidiaPulseTestSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
+	if n > 0 {
+		total += platform.SATEstimatedNvidiaGPUStressSec +
+			platform.SATEstimatedNvidiaTargetedStressSec +
+			platform.SATEstimatedNvidiaTargetedPowerSec
+	}
 	return total
 }
@@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Runs NVIDIA diagnostics and board inventory checks.`,
 			`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
-			func() string {
-				perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
-				perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
-				if n > 0 {
-					return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
-						validateFmtDur(perV), n, validateFmtDur(perV*n),
-						validateFmtDur(perS), n, validateFmtDur(perS*n))
-				}
-				return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
-					validateFmtDur(perV), validateFmtDur(perS))
-			}(),
+			fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
+				validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
+				validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
 		)) +
 		`<div id="sat-card-nvidia-targeted-stress">` +
 		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
 			inv.NVIDIA,
 			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
 			`<code>dcgmi diag targeted_stress</code>`,
-			func() string {
-				per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
-				s := "Skipped in Validate. "
-				if n > 0 {
-					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
-				} else {
-					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
-				}
-				return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
-			}(),
+			"Skipped in Validate. Stress: "+validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-targeted-power">` +
@@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
 			`<code>dcgmi diag targeted_power</code>`,
-			func() string {
-				per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
-				s := "Skipped in Validate. "
-				if n > 0 {
-					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
-				} else {
-					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
-				}
-				return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
-			}(),
+			"Skipped in Validate. Stress: "+validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-pulse">` +
@@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) {
   return enqueueSATTarget(target, overrides)
     .then(d => streamSATTask(d.task_id, title, false));
 }
-const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
-const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+const nvidiaPerGPUTargets = [];
+const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
 function satAllGPUIndicesForMulti() {
   return Promise.resolve(satSelectedGPUIndices());
 }
@@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) {
   });
 }
 function runNvidiaValidateSet(target) {
-  return loadSatNvidiaGPUs().then(gpus => {
-    const selected = satSelectedGPUIndices();
-    const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
-    if (!picked.length) {
-      throw new Error('Select at least one NVIDIA GPU.');
-    }
-    if (picked.length === 1) {
-      const gpu = picked[0];
-      return runSATWithOverrides(target, {
-        gpu_indices: [Number(gpu.index)],
-        display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
-      });
-    }
-    document.getElementById('sat-output').style.display='block';
-    document.getElementById('sat-title').textContent = '— ' + target;
-    const term = document.getElementById('sat-terminal');
-    term.textContent = 'Running ' + target + ' one GPU at a time...\n';
-    const labelBase = satLabels()[target] || ('Validate ' + target);
-    const runNext = (idx) => {
-      if (idx >= picked.length) return Promise.resolve();
-      const gpu = picked[idx];
-      const gpuLabel = satGPUDisplayName(gpu);
-      term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
-      return enqueueSATTarget(target, {
-        gpu_indices: [Number(gpu.index)],
-        display_name: labelBase + ' (' + gpuLabel + ')',
-      }).then(d => {
-        return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
-      }).then(function() {
-        return runNext(idx + 1);
-      });
-    };
-    return runNext(0);
-  });
+  const selected = satSelectedGPUIndices();
+  if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
+  return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
 }
 function runAMDValidateSet() {
   const targets = selectedAMDValidateTargets();

View File

@@ -126,6 +126,37 @@ resolve_iso_version() {
 	resolve_audit_version
 }
 
+sync_builder_workdir() {
+	src_dir="$1"
+	dst_dir="$2"
+	mkdir -p "$dst_dir"
+	# Historical bug: old workdirs could keep config/bootloaders/grub-pc even
+	# after the source tree moved to grub-efi only. Remove bootloaders eagerly
+	# so reused workdirs cannot leak stale templates into a new ISO build.
+	rm -rf "$dst_dir/config/bootloaders"
+	rsync -a --delete \
+		--exclude='cache/' \
+		--exclude='chroot/' \
+		--exclude='.build/' \
+		--exclude='*.iso' \
+		--exclude='*.packages' \
+		--exclude='*.contents' \
+		--exclude='*.files' \
+		"$src_dir/" "$dst_dir/"
+	if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
+		echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
+		exit 1
+	fi
+	if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
+		echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
+		exit 1
+	fi
+}
+
 iso_list_files() {
 	iso_path="$1"
@@ -466,6 +497,75 @@ validate_iso_memtest() {
echo "=== memtest validation OK ===" echo "=== memtest validation OK ==="
} }
validate_iso_live_boot_entries() {
iso_path="$1"
echo "=== validating live boot entries in ISO ==="
[ -f "$iso_path" ] || {
echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
exit 1
}
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
echo "ERROR: ISO reader unavailable for live boot validation" >&2
exit 1
}
grub_cfg="$(mktemp)"
isolinux_cfg="$(mktemp)"
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
fi
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
echo "ERROR: GRUB toram entry is missing" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'linux .*boot=live ' "$grub_cfg" || {
echo "ERROR: GRUB live entry is missing boot=live" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
echo "ERROR: isolinux live entry is missing boot=live" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
rm -f "$grub_cfg" "$isolinux_cfg"
echo "=== live boot validation OK ==="
}
validate_iso_nvidia_runtime() { validate_iso_nvidia_runtime() {
iso_path="$1" iso_path="$1"
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0 [ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
@@ -558,6 +658,21 @@ extract_live_grub_entry() {
 	return 0
 }
 
+load_live_build_append() {
+	lb_dir="$1"
+	binary_cfg="$lb_dir/config/binary"
+	[ -f "$binary_cfg" ] || return 1
+	# config/binary is generated by live-build and contains shell variable
+	# assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
+	# shellcheck disable=SC1090
+	. "$binary_cfg"
+	[ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
+	live_build_append="$LB_BOOTAPPEND_LIVE"
+	return 0
+}
+
 extract_live_isolinux_entry() {
 	cfg="$1"
 	isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
@@ -594,36 +709,15 @@ echo " Hardware Audit LiveCD"
echo "" echo ""
menuentry "EASY-BEE" { menuentry "EASY-BEE" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd} initrd ${initrd}
} }
menuentry "EASY-BEE load to RAM (toram)" { menuentry "EASY-BEE -- load to RAM (toram)" {
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd} initrd ${initrd}
} }
submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — GSP=off" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE — KMS (no nomodeset)" {
linux ${kernel} ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE — KMS + GSP=off" {
linux ${kernel} ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE — fail-safe" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd ${initrd}
}
}
if [ "\${grub_platform}" = "efi" ]; then if [ "\${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" { menuentry "Memory Test (memtest86+)" {
@@ -699,13 +793,18 @@ enforce_live_build_bootloader_assets() {
 	grub_dir="$lb_dir/binary/boot/grub"
 	isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
 
+	if ! load_live_build_append "$lb_dir"; then
+		echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
+		live_build_append=""
+	fi
+
 	if [ -f "$grub_cfg" ]; then
 		if extract_live_grub_entry "$grub_cfg"; then
 			mkdir -p "$grub_dir/live-theme"
 			cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
 			cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
 			cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
-			write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "$grub_append" "$grub_initrd"
+			write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
 			echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
 		else
 			echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
@@ -714,7 +813,7 @@ enforce_live_build_bootloader_assets() {
if [ -f "$isolinux_cfg" ]; then if [ -f "$isolinux_cfg" ]; then
if extract_live_isolinux_entry "$isolinux_cfg"; then if extract_live_isolinux_entry "$isolinux_cfg"; then
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "$isolinux_append" write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu" echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
else else
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2 echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
@@ -1112,15 +1211,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
 mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
 # Sync builder config into variant work dir, preserving lb cache.
-rsync -a --delete \
-    --exclude='cache/' \
-    --exclude='chroot/' \
-    --exclude='.build/' \
-    --exclude='*.iso' \
-    --exclude='*.packages' \
-    --exclude='*.contents' \
-    --exclude='*.files' \
-    "${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
+sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
 # Share deb package cache across variants.
 # Restore: populate work dir cache from shared cache before build.
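
sync_builder_workdir presumably centralises the inline rsync removed above so every call site shares one exclude list (the "clear stale bootloader templates" commit in this range may add more to it). A sketch assuming it simply wraps the old invocation:

# Sketch -- body assumed identical to the inline rsync it replaces.
sync_builder_workdir() {
    src="$1"
    dst="$2"
    # Mirror builder config into the variant work dir while preserving
    # the lb cache/chroot/.build state and skipping build products.
    rsync -a --delete \
        --exclude='cache/' \
        --exclude='chroot/' \
        --exclude='.build/' \
        --exclude='*.iso' \
        --exclude='*.packages' \
        --exclude='*.contents' \
        --exclude='*.files' \
        "${src}/" "${dst}/"
}
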
@@ -1411,8 +1502,11 @@ dump_memtest_debug "pre-build" "${LB_DIR}"
 run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
 echo "=== enforcing canonical bootloader assets ==="
 enforce_live_build_bootloader_assets "${LB_DIR}"
+reset_live_build_stage "${LB_DIR}" "binary_checksums"
+reset_live_build_stage "${LB_DIR}" "binary_iso"
+reset_live_build_stage "${LB_DIR}" "binary_zsync"
 run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
-run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "rm -f '${LB_DIR}/live-image-amd64.hybrid.iso' && lb binary_iso 2>&1"
+run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
 run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
 # --- persist deb package cache back to shared location ---
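
reset_live_build_stage is defined elsewhere in the script; live-build skips a stage whenever its stamp file already exists under .build/, so clearing the stamps is what lets the binary_checksums/binary_iso/binary_zsync stages run a second time after the bootloader sync (and why the explicit rm -f of the old ISO is no longer needed). A plausible shape, assuming live-build's standard .build/<stage> stamp convention:

# Sketch -- assumes one stamp file per stage under .build/.
reset_live_build_stage() {
    lb_dir="$1"
    stage="$2"
    rm -f "$lb_dir/.build/$stage"
}
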
@@ -1438,6 +1532,7 @@ if [ -f "$ISO_RAW" ]; then
 fi
 fi
 validate_iso_memtest "$ISO_RAW"
+validate_iso_live_boot_entries "$ISO_RAW"
 validate_iso_nvidia_runtime "$ISO_RAW"
 cp "$ISO_RAW" "$ISO_OUT"
 echo ""


@@ -23,9 +23,9 @@ insmod serial
 serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
 insmod gfxterm
-insmod png
-source /boot/grub/theme.cfg
 terminal_input console serial
 terminal_output gfxterm serial
+insmod png
+source /boot/grub/theme.cfg


@@ -1,47 +1,16 @@
 source /boot/grub/config.cfg
-echo ""
-echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
-echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
-echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
-echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
-echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
-echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
-echo " Hardware Audit LiveCD"
-echo ""
 menuentry "EASY-BEE" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
 initrd @INITRD_LIVE@
 }
-submenu "EASY-BEE (advanced options) -->" {
-menuentry "EASY-BEE — load to RAM (toram)" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-initrd @INITRD_LIVE@
-}
-menuentry "EASY-BEE — GSP=off" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-initrd @INITRD_LIVE@
-}
-menuentry "EASY-BEE — KMS (no nomodeset)" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-initrd @INITRD_LIVE@
-}
-menuentry "EASY-BEE — KMS + GSP=off" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-initrd @INITRD_LIVE@
-}
-menuentry "EASY-BEE — fail-safe" {
-linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
-initrd @INITRD_LIVE@
-}
+menuentry "EASY-BEE -- load to RAM (toram)" {
+linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+initrd @INITRD_LIVE@
 }
 if [ "${grub_platform}" = "efi" ]; then
 menuentry "Memory Test (memtest86+)" {
 chainloader /boot/memtest86+x64.efi

bee-logo.png: binary file changed, not shown (70 KiB before, 78 KiB after)


@@ -5,12 +5,10 @@ title-text: ""
 message-font: "Unifont Regular 16"
 terminal-font: "Unifont Regular 16"
-#bee logo centered, upper third of screen
+#bee logo - centered, upper third of screen
 + image {
 top = 4%
 left = 50%-200
-width = 400
-height = 400
 file = "bee-logo.png"
 }
@@ -36,11 +34,11 @@ terminal-font: "Unifont Regular 16"
 item_font = "Unifont Regular 16"
 selected_item_color= "#f5a800"
 selected_item_font = "Unifont Regular 16"
-item_height = 16
-item_padding = 0
+item_height = 20
+item_padding = 2
 item_spacing = 4
 icon_width = 0
-icon_heigh = 0
+icon_height = 0
 item_icon_space = 0
 }