Compare commits

..

4 Commits
v8.25 ... v8.29

Author SHA1 Message Date
51b721aeb3 Add real-data duration estimates to benchmark and burn pages
- Add BenchmarkEstimated* constants to benchmark_types.go from _v8 logs
  (Standard Perf ~16 min, Standard Power Fit ~43 min, Stability Perf ~92 min)
- Update benchmark profile dropdown to show Perf / Power Fit timing per profile
- Add timing columns to Method Split table (Standard vs Stability per run type)
- Update burn preset labels to show "N min/GPU (sequential) or N min (parallel)"
- Clarify burn "one by one" description with sequential vs parallel scaling

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-18 10:54:50 +03:00
bac89bb6e5 Add real-data duration estimates to validate tab profiles
- Add SATEstimated* constants to sat.go derived from _v8 production logs,
  with a rule to recalculate them whenever the script changes
- Extend validateInventory with NvidiaGPUCount to make estimates GPU-aware
- Update all validate card duration strings: CPU, memory, storage, NVIDIA GPU,
  targeted stress/power, pulse test, NCCL, nvbandwidth
- Fix nvbandwidth description ("intended to stay short" → actual ~45 min)
- Top-level profile labels show computed total including GPU count

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-18 10:51:15 +03:00
7a618da1f9 Redesign system power chart as stacked per-PSU area chart
- Add PSUReading struct and PSUs []PSUReading to LiveMetricSample
- Sample per-PSU input watts from IPMI SDR entity 10.x (Power Supply)
- Render stacked filled-area SVG chart (one layer per PSU, cumulative total)
- Fall back to single-line chart on systems with ≤1 PSU in SDR

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-18 10:42:00 +03:00
64ae1c0ff0 Sync GRUB and isolinux boot entries; document sync rule
grub-efi/grub.cfg: add KMS+GSP=off entry (was in isolinux, missing in GRUB)

isolinux/live.cfg.in: add full standard param set to all entries
(net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
numa_balancing=disable nowatchdog nosoftlockup) to match grub-efi

bible-local/docs/iso-build-rules.md: add bootloader sync rule documenting
that grub-efi and isolinux must be kept in sync manually, listing canonical
entries and standard param set, and noting the grub-pc/grub-efi history.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-18 10:32:16 +03:00
11 changed files with 544 additions and 93 deletions

View File

@@ -43,6 +43,31 @@ const (
NvidiaBenchmarkProfileOvernight = "overnight" NvidiaBenchmarkProfileOvernight = "overnight"
) )
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
// re-measure from actual task logs and update the constants here.
//
// Sources:
// - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
// - BenchmarkEstimatedPowerStabilitySec: xFusion v8.17/v8.22 ramp 1-8: 1977-2002 s
const (
	// Performance Benchmark, driven by bee-gpu-burn.
	// Each figure covers one full ramp-up run (ramp 1→N) or one single
	// parallel run; sequential per-GPU mode scales approximately linearly.
	BenchmarkEstimatedPerfStandardSec  = 16 * 60 // 960 s; measured ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
	BenchmarkEstimatedPerfStabilitySec = 5532    // ~92 min; measured on a ramp-up 1-8 run
	BenchmarkEstimatedPerfOvernightSec = 8 * 60 * 60

	// Power / Thermal Fit, driven by the dcgmi targeted_power binary-search
	// calibration. Each figure covers the full ramp-up run; individual steps
	// vary with convergence speed.
	BenchmarkEstimatedPowerStandardSec  = 2600 // ~43 min; measured ramp 1-4: 2663 s, ramp 1-8: 2375 s
	BenchmarkEstimatedPowerStabilitySec = 2000 // ~33 min; the stability profile converges faster (longer steady → faster convergence)
	BenchmarkEstimatedPowerOvernightSec = 3 * 60 * 60
)
type NvidiaBenchmarkOptions struct { type NvidiaBenchmarkOptions struct {
Profile string Profile string
SizeMB int SizeMB int

View File

@@ -18,11 +18,19 @@ type LiveMetricSample struct {
Fans []FanReading `json:"fans"` Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"` Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"` PowerW float64 `json:"power_w"`
PSUs []PSUReading `json:"psus,omitempty"`
CPULoadPct float64 `json:"cpu_load_pct"` CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"` MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"` GPUs []GPUMetricRow `json:"gpus"`
} }
// PSUReading is a per-slot power supply input power reading.
// samplePSUPower produces one PSUReading per Power Supply entity that reports
// a value in Watts, ordered by ascending Slot.
type PSUReading struct {
	// Slot is the numeric instance N taken from the IPMI SDR entity ID "10.N".
	Slot int `json:"slot"`
	// Name is the sensor name from the first column of the SDR listing.
	Name string `json:"name"`
	// PowerW is the sensor reading in Watts.
	PowerW float64 `json:"power_w"`
}
// TempReading is a named temperature sensor value. // TempReading is a named temperature sensor value.
type TempReading struct { type TempReading struct {
Name string `json:"name"` Name string `json:"name"`
@@ -57,6 +65,9 @@ func SampleLiveMetrics() LiveMetricSample {
// System power — returns 0 if unavailable // System power — returns 0 if unavailable
s.PowerW = sampleSystemPower() s.PowerW = sampleSystemPower()
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
s.PSUs = samplePSUPower()
// CPU load — from /proc/stat // CPU load — from /proc/stat
s.CPULoadPct = sampleCPULoadPct() s.CPULoadPct = sampleCPULoadPct()
@@ -326,3 +337,65 @@ func compactAmbientTempName(chip, name string) string {
} }
return chip + " / " + name return chip + " / " + name
} }
// samplePSUPower collects per-PSU power readings from the IPMI SDR.
//
// It runs `ipmitool sdr elist full` and scans the pipe-separated rows for
// Power Supply entity sensors (entity ID "10.N") whose reading is expressed in
// Watts. When a slot has several such sensors, the one with the highest
// reading is kept. The result is ordered by ascending slot number.
//
// Returns nil when the command fails, prints nothing, or yields no PSU Watt
// sensor with a positive reading.
func samplePSUPower() []PSUReading {
	raw, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
	if err != nil || len(raw) == 0 {
		return nil
	}

	// Best reading seen so far for each slot, with the name of the sensor
	// that reported it.
	watts := map[int]float64{}
	names := map[int]string{}

	for _, row := range strings.Split(string(raw), "\n") {
		cols := strings.Split(row, "|")
		if len(cols) < 5 {
			continue
		}

		// Column 3 is the entity ID, e.g. "10.1"; entity type 10 is Power Supply.
		entity := strings.TrimSpace(cols[3])
		if !strings.HasPrefix(entity, "10.") {
			continue
		}
		slot, err := strconv.Atoi(strings.TrimPrefix(entity, "10."))
		if err != nil {
			continue
		}

		// Column 4 is the reading, e.g. "740.00 Watts".
		reading := strings.TrimSpace(cols[4])
		if !strings.Contains(strings.ToLower(reading), "watts") {
			continue
		}
		tokens := strings.Fields(reading)
		if len(tokens) < 2 {
			continue
		}
		w, err := strconv.ParseFloat(tokens[0], 64)
		if err != nil || w <= 0 {
			continue
		}

		// Duplicate sensors on one slot: the larger reading wins.
		if prev, seen := watts[slot]; !seen || w > prev {
			watts[slot] = w
			names[slot] = strings.TrimSpace(cols[0])
		}
	}

	if len(watts) == 0 {
		return nil
	}

	// Map iteration order is random; sort the slots for a stable result.
	order := make([]int, 0, len(watts))
	for slot := range watts {
		order = append(order, slot)
	}
	sort.Ints(order)

	readings := make([]PSUReading, len(order))
	for i, slot := range order {
		readings[i] = PSUReading{Slot: slot, Name: names[slot], PowerW: watts[slot]}
	}
	return readings
}

View File

@@ -20,6 +20,54 @@ import (
"time" "time"
) )
// Estimated wall-clock durations for each SAT/validate test, derived from real
// production logs in _benchmark/_v8/.
//
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
// the corresponding Run*Pack function change, re-measure the wall-clock duration
// from actual task logs and update the matching constant here.
//
// Sources:
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77-87 s/GPU
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444-448 s/GPU
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347-348 s/GPU (300 s default + overhead)
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346-351 s/GPU
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4926 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210-384 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2664-2688 s / 8 GPU (all simultaneous)
const (
	// CPU, Validate mode: a 60 s stress-ng pass plus lscpu/sensors overhead.
	SATEstimatedCPUValidateSec = 65
	// CPU, Stress mode: stress-ng for 1800 s (the stress mode default).
	SATEstimatedCPUStressSec = 30 * 60

	// RAM, Validate mode: memtester over 256 MB, 1 pass.
	SATEstimatedMemoryValidateSec = 70
	// RAM, Stress mode: memtester over 512 MB, 1 pass. Extrapolated from the
	// validate timing, assuming duration is linear with size.
	SATEstimatedMemoryStressSec = 140

	// NVIDIA dcgmi diag Level 2 (medium); per GPU, GPUs run sequentially.
	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
	// NVIDIA dcgmi diag Level 3 (targeted stress); per GPU, GPUs run sequentially.
	SATEstimatedNvidiaGPUStressPerGPUSec = 450
	// NVIDIA dcgmi targeted_stress, 300 s plus overhead; per GPU, sequential.
	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
	// NVIDIA dcgmi targeted_power, 300 s plus overhead; per GPU, sequential.
	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350

	// NVIDIA dcgmi pulse_test: one run across all GPUs at once (not per-GPU).
	SATEstimatedNvidiaPulseTestSec = 5000
	// NCCL all_reduce_perf: one run across all GPUs at once.
	SATEstimatedNvidiaInterconnectSec = 5 * 60
	// nvbandwidth: one run across all GPUs at once. The tool runs all of its
	// built-in tests and offers no user-configurable time limit, so the
	// duration is determined by nvbandwidth itself.
	SATEstimatedNvidiaBandwidthSec = 45 * 60
)
var ( var (
satExecCommand = exec.Command satExecCommand = exec.Command
satLookPath = exec.LookPath satLookPath = exec.LookPath

View File

@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
return out return out
} }
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
// is visually "stacked" on top of the previous one. Intended for multi-PSU
// power charts where the filled area of each PSU shows its individual
// contribution and the total height equals the combined draw.
//
// Parameters:
//   - title: chart title passed to the frame writer.
//   - labels, times: X-axis labels and timestamps. The longer of the two sets
//     the point count; the shorter one is padded or synthesized. When both are
//     empty a single zero-time point is used.
//   - datasets: one value series per layer, stacked in slice order. An empty
//     series is treated as all zeros. The caller's slice is not modified.
//   - names: legend names, matched to datasets by index. Missing names render
//     as empty legend entries; surplus names are ignored.
//   - yMax: upper bound of the Y axis; when nil it is derived from the stacked
//     total via autoMax120.
//   - canvasHeight: forwarded to singleAxisChartLayout.
//   - timeline: segments forwarded to the timeline idle-span and boundary writers.
//
// Returns the SVG document bytes. The error result is always nil.
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
	pointCount := len(labels)
	if len(times) > pointCount {
		pointCount = len(times)
	}
	if pointCount == 0 {
		pointCount = 1
		labels = []string{""}
		times = []time.Time{{}}
	}
	if len(labels) < pointCount {
		padded := make([]string, pointCount)
		copy(padded, labels)
		labels = padded
	}
	if len(times) < pointCount {
		times = synthesizeChartTimes(times, pointCount)
	}

	// Pad empty series on a private copy of the outer slice so the caller's
	// datasets argument is left untouched.
	series := make([][]float64, len(datasets))
	copy(series, datasets)
	for i := range series {
		if len(series[i]) == 0 {
			series[i] = make([]float64, pointCount)
		}
	}
	times, series = downsampleTimeSeries(times, series, 1400)
	pointCount = len(times)

	// Build cumulative sums per time point. Row 0 is the zero baseline and row
	// i+1 is the top edge of layer i. Iterating over pointCount (not over the
	// series length) keeps a short series from dropping its top edge to zero
	// below the layer underneath, and keeps a long series from indexing past
	// the end of the row.
	cumulative := make([][]float64, len(series)+1)
	for i := range cumulative {
		cumulative[i] = make([]float64, pointCount)
	}
	for i, ds := range series {
		for j := 0; j < pointCount; j++ {
			v := 0.0
			if j < len(ds) {
				v = ds[j]
			}
			cumulative[i+1][j] = cumulative[i][j] + v
		}
	}

	// Scale is based on the total (top cumulative row).
	total := cumulative[len(cumulative)-1]
	yMin := floatPtr(0)
	if yMax == nil {
		yMax = autoMax120(total)
	}
	scale := singleAxisChartScale([][]float64{total}, yMin, yMax)

	// One legend entry per layer. Driving the loop from the series (not from
	// names) guarantees every layer gets a palette color and that a names
	// slice of a different length cannot index out of range.
	legendItems := make([]metricChartSeries, len(series))
	for i := range series {
		name := ""
		if i < len(names) {
			name = names[i]
		}
		color := metricChartPalette[i%len(metricChartPalette)]
		legendItems[i] = metricChartSeries{Name: name, Color: color, Values: series[i]}
	}

	// Stats label from totals.
	statsLabel := chartStatsLabel([][]float64{total})

	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
	start, end := chartTimeBounds(times)

	var b strings.Builder
	writeSVGOpen(&b, layout.Width, layout.Height)
	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
	writeTimelineIdleSpans(&b, layout, start, end, timeline)
	writeVerticalGrid(&b, layout, times, pointCount, 8)
	writeHorizontalGrid(&b, layout, scale)
	writeTimelineBoundaries(&b, layout, start, end, timeline)
	writePlotBorder(&b, layout)
	writeSingleAxisY(&b, layout, scale)
	writeXAxisLabels(&b, layout, times, labels, start, end, 8)

	// Draw stacked areas from top to bottom so lower layers are visible.
	for i := len(series) - 1; i >= 0; i-- {
		writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
	}
	// Draw border polylines on top.
	for i := len(series) - 1; i >= 0; i-- {
		writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
	}

	writeLegend(&b, layout, legendItems)
	writeSVGClose(&b)
	return []byte(b.String()), nil
}
// writeStackedArea emits one filled SVG polygon covering the band between two
// cumulative value arrays: baseline (lower edge) and top (upper edge). The
// polygon is filled with color at 55% opacity and has no stroke.
//
// Nothing is written when top is empty. A baseline shorter than top is
// replaced by an all-zero baseline of the same length.
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
	count := len(top)
	if count == 0 {
		return
	}
	if len(baseline) < count {
		baseline = make([]float64, count)
	}

	var outline strings.Builder
	// addVertex appends the "x,y" pair for point idx at the given value,
	// separated from any previous vertex by a single space.
	addVertex := func(idx int, value float64) {
		px := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
		py := chartYForValue(valueClamp(value, scale), scale, layout.PlotTop, layout.PlotBottom)
		if outline.Len() > 0 {
			outline.WriteByte(' ')
		}
		outline.WriteString(strconv.FormatFloat(px, 'f', 1, 64))
		outline.WriteByte(',')
		outline.WriteString(strconv.FormatFloat(py, 'f', 1, 64))
	}

	// Walk left to right along the upper edge, then right to left along the
	// lower edge, so the vertices trace the band's outline in order.
	for idx, value := range top {
		addVertex(idx, value)
	}
	for idx := count - 1; idx >= 0; idx-- {
		addVertex(idx, baseline[idx])
	}

	fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", outline.String(), color)
}
func writeSVGOpen(b *strings.Builder, width, height int) { func writeSVGOpen(b *strings.Builder, width, height int) {
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height) fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
} }

View File

@@ -1378,15 +1378,64 @@ setInterval(loadMetricsLayout, 5000);
// ── Validate (Acceptance Tests) ─────────────────────────────────────────────── // ── Validate (Acceptance Tests) ───────────────────────────────────────────────
type validateInventory struct { type validateInventory struct {
CPU string CPU string
Memory string Memory string
Storage string Storage string
NVIDIA string NVIDIA string
AMD string AMD string
NvidiaGPUCount int
AMDGPUCount int
}
// validateFmtDur renders a duration given in seconds as a short approximate
// label: "~N s" for anything under two minutes, "~N min" otherwise.
//
// Minutes are computed as (secs+29)/60, i.e. rounded to the nearest minute
// with an exact half minute rounding down (150 s → "~2 min", 151 s → "~3 min").
func validateFmtDur(secs int) string {
	const minuteThresholdSec = 120

	amount, unit := secs, "s"
	if secs >= minuteThresholdSec {
		amount, unit = (secs+29)/60, "min"
	}
	return fmt.Sprintf("~%d %s", amount, unit)
}
// validateTotalValidateSec estimates, in seconds, the wall-clock duration of
// "Validate one by one" in Validate mode for n NVIDIA GPUs. A negative n is
// treated as zero.
//
// The sum covers CPU, memory, the per-GPU NVIDIA diagnostic, NCCL interconnect
// and nvbandwidth. Storage is not part of the sum, and the interconnect and
// bandwidth terms are added regardless of n.
func validateTotalValidateSec(n int) int {
	if n < 0 {
		n = 0
	}

	// Stages whose duration does not depend on the GPU count.
	fixed := platform.SATEstimatedCPUValidateSec +
		platform.SATEstimatedMemoryValidateSec +
		platform.SATEstimatedNvidiaInterconnectSec +
		platform.SATEstimatedNvidiaBandwidthSec

	// The diagnostic runs once per GPU, so it scales with n.
	return fixed + n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec
}
// validateTotalStressSec returns the estimated wall-clock duration of
// "Validate one by one" in Stress mode for n NVIDIA GPUs.
func validateTotalStressSec(n int) int {
if n < 0 {
n = 0
}
total := platform.SATEstimatedCPUStressSec +
platform.SATEstimatedMemoryStressSec +
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
platform.SATEstimatedNvidiaPulseTestSec +
platform.SATEstimatedNvidiaInterconnectSec +
platform.SATEstimatedNvidiaBandwidthSec
return total
} }
func renderValidate(opts HandlerOptions) string { func renderValidate(opts HandlerOptions) string {
inv := loadValidateInventory(opts) inv := loadValidateInventory(opts)
n := inv.NvidiaGPUCount
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
gpuNote := ""
if n > 0 {
gpuNote = fmt.Sprintf(" (%d GPU)", n)
}
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div> return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p> <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
@@ -1396,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
<div class="validate-profile-col"> <div class="validate-profile-col">
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div> <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label> <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~3060 min)</span></label> <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
</div> </div>
<div class="validate-profile-col validate-profile-action"> <div class="validate-profile-col validate-profile-action">
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~515 min total); Stress is thorough (~3060 min total).</p> <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button> <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
<div style="margin-top:12px"> <div style="margin-top:12px">
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span> <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
@@ -1413,19 +1462,19 @@ func renderValidate(opts HandlerOptions) string {
inv.CPU, inv.CPU,
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`, `Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`, `<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
`60s in Validate, 30 min in Stress.`, validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
)) + )) +
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody( renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
inv.Memory, inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`, `Runs a RAM validation pass and records memory state around the test.`,
`<code>free</code>, <code>memtester</code>`, `<code>free</code>, <code>memtester</code>`,
`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`, validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
)) + )) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody( renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage, inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each device type.`, `Scans all storage devices and runs the matching health or self-test path for each device type.`,
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`, `<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
`Short self-test in Validate, extended self-test in Stress.`, `Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
)) + )) +
`</div> `</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div> <div style="height:1px;background:var(--border);margin:16px 0"></div>
@@ -1450,14 +1499,33 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`, `Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`, `<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`, func() string {
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
if n > 0 {
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
validateFmtDur(perV), n, validateFmtDur(perV*n),
validateFmtDur(perS), n, validateFmtDur(perS*n))
}
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
validateFmtDur(perV), validateFmtDur(perS))
}(),
)) + )) +
`<div id="sat-card-nvidia-targeted-stress">` + `<div id="sat-card-nvidia-targeted-stress">` +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA, inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`, `Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`, `<code>dcgmi diag targeted_stress</code>`,
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`, func() string {
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
s := "Skipped in Validate. "
if n > 0 {
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
} else {
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
}
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
}(),
)) + )) +
`</div>` + `</div>` +
`<div id="sat-card-nvidia-targeted-power">` + `<div id="sat-card-nvidia-targeted-power">` +
@@ -1465,7 +1533,16 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`, `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
`<code>dcgmi diag targeted_power</code>`, `<code>dcgmi diag targeted_power</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`, func() string {
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
s := "Skipped in Validate. "
if n > 0 {
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
} else {
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
}
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
}(),
)) + )) +
`</div>` + `</div>` +
`<div id="sat-card-nvidia-pulse">` + `<div id="sat-card-nvidia-pulse">` +
@@ -1473,7 +1550,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`, `Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
`<code>dcgmi diag pulse_test</code>`, `<code>dcgmi diag pulse_test</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`, `Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) + )) +
`</div>` + `</div>` +
`<div id="sat-card-nvidia-interconnect">` + `<div id="sat-card-nvidia-interconnect">` +
@@ -1481,7 +1558,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`, `<code>all_reduce_perf</code> (NCCL tests)`,
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`, `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) + )) +
`</div>` + `</div>` +
`<div id="sat-card-nvidia-bandwidth">` + `<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1566,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA, inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`, `<code>nvbandwidth</code>`,
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`, `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
)) + )) +
`</div>` + `</div>` +
`</div> `</div>
@@ -1922,6 +1999,8 @@ func loadValidateInventory(opts HandlerOptions) validateInventory {
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device") out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU") out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU") out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
out.NvidiaGPUCount = nvidiaTotal
out.AMDGPUCount = amdTotal
return out return out
} }
@@ -2031,9 +2110,9 @@ func renderBenchmark(opts HandlerOptions) string {
<div class="form-row"> <div class="form-row">
<label>Profile</label> <label>Profile</label>
<select id="benchmark-profile"> <select id="benchmark-profile">
<option value="standard" selected>Standard — about 15 minutes</option> <option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
<option value="stability">Stability — 1 to 2 hours</option> <option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
<option value="overnight">Overnight — 8 hours</option> <option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
</select> </select>
</div> </div>
<div class="form-row"> <div class="form-row">
@@ -2073,11 +2152,11 @@ func renderBenchmark(opts HandlerOptions) string {
<div class="card-body"> <div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p> <p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
<table> <table>
<tr><th>Run Type</th><th>Engine</th><th>Question</th></tr> <tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td></tr> <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
<tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td></tr> <tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
</table> </table>
<p style="font-size:12px;color:var(--muted);margin-top:10px">Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p> <p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 48 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
</div> </div>
</div> </div>
</div> </div>
@@ -2566,13 +2645,13 @@ func renderBurn() string {
<div class="card-body burn-profile-body"> <div class="card-body burn-profile-body">
<div class="burn-profile-col"> <div class="burn-profile-col">
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div> <div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — quick check (~5 min)</span></label> <label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 hour</span></label> <label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label> <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
</div> </div>
<div class="burn-profile-col burn-profile-action"> <div class="burn-profile-col burn-profile-action">
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button> <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p> <p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
</div> </div>
<div class="burn-profile-col burn-profile-action"> <div class="burn-profile-col burn-profile-action">
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button> <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>

View File

@@ -575,12 +575,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
} }
timeline := metricsTimelineSegments(samples, time.Now()) timeline := metricsTimelineSegments(samples, time.Now())
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" { if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline) var overviewOk bool
var buf []byte
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
if err != nil { if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError) http.Error(w, err.Error(), http.StatusInternalServerError)
return return
} }
if !ok { if !overviewOk {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable) http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return return
} }
@@ -589,23 +591,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf) _, _ = w.Write(buf)
return return
} }
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples) datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
if !ok { if !ok {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable) http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return return
} }
buf, err := renderMetricChartSVG( var buf []byte
title, if stacked {
labels, buf, err = renderStackedMetricChartSVG(
sampleTimes(samples), title,
datasets, labels,
names, sampleTimes(samples),
yMin, datasets,
yMax, names,
chartCanvasHeightForPath(path, len(names)), yMax,
timeline, chartCanvasHeightForPath(path, len(names)),
) timeline,
)
} else {
buf, err = renderMetricChartSVG(
title,
labels,
sampleTimes(samples),
datasets,
names,
yMin,
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
}
if err != nil { if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError) http.Error(w, err.Error(), http.StatusInternalServerError)
return return
@@ -615,12 +631,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf) _, _ = w.Write(buf)
} }
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) { func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
var datasets [][]float64 labels = sampleTimeLabels(samples)
var names []string
var title string
var yMin, yMax *float64
labels := sampleTimeLabels(samples)
switch { switch {
case path == "server-load": case path == "server-load":
@@ -656,15 +668,41 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
case path == "server-power": case path == "server-power":
title = "System Power" title = "System Power"
power := make([]float64, len(samples)) // Use per-PSU stacked chart when PSU SDR data is available.
for i, s := range samples { // Collect the union of PSU slots seen across all samples.
power[i] = s.PowerW psuSlots := psuSlotsFromSamples(samples)
if len(psuSlots) > 1 {
// Build one dataset per PSU slot.
psuDatasets := make([][]float64, len(psuSlots))
psuNames := make([]string, len(psuSlots))
for si, slot := range psuSlots {
ds := make([]float64, len(samples))
for i, s := range samples {
for _, psu := range s.PSUs {
if psu.Slot == slot {
ds[i] = psu.PowerW
break
}
}
}
psuDatasets[si] = normalizePowerSeries(ds)
psuNames[si] = fmt.Sprintf("PSU %d", slot)
}
datasets = psuDatasets
names = psuNames
stacked = true
yMax = autoMax120(psuStackedTotal(psuDatasets))
} else {
power := make([]float64, len(samples))
for i, s := range samples {
power[i] = s.PowerW
}
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{"Power W"}
yMin = floatPtr(0)
yMax = autoMax120(power)
} }
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{"Power W"}
yMin = floatPtr(0)
yMax = autoMax120(power)
case path == "server-fans": case path == "server-fans":
title = "Fan RPM" title = "Fan RPM"
@@ -707,7 +745,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
case strings.HasPrefix(path, "gpu/"): case strings.HasPrefix(path, "gpu/"):
idx, sub, ok := parseGPUChartPath(path) idx, sub, ok := parseGPUChartPath(path)
if !ok { if !ok {
return nil, nil, nil, "", nil, nil, false return nil, nil, nil, "", nil, nil, false, false
} }
switch sub { switch sub {
case "load": case "load":
@@ -715,7 +753,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct }) util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct }) mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
if util == nil && mem == nil { if util == nil && mem == nil {
return nil, nil, nil, "", nil, nil, false return nil, nil, nil, "", nil, nil, false, false
} }
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))} datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
names = []string{"Load %", "Mem %"} names = []string{"Load %", "Mem %"}
@@ -725,7 +763,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
title = gpuDisplayLabel(idx) + " Temperature" title = gpuDisplayLabel(idx) + " Temperature"
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC }) temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
if temp == nil { if temp == nil {
return nil, nil, nil, "", nil, nil, false return nil, nil, nil, "", nil, nil, false, false
} }
datasets = [][]float64{temp} datasets = [][]float64{temp}
names = []string{"Temp °C"} names = []string{"Temp °C"}
@@ -735,7 +773,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
title = gpuDisplayLabel(idx) + " Core Clock" title = gpuDisplayLabel(idx) + " Core Clock"
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz }) clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
if clock == nil { if clock == nil {
return nil, nil, nil, "", nil, nil, false return nil, nil, nil, "", nil, nil, false, false
} }
datasets = [][]float64{clock} datasets = [][]float64{clock}
names = []string{"Core Clock MHz"} names = []string{"Core Clock MHz"}
@@ -744,7 +782,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
title = gpuDisplayLabel(idx) + " Memory Clock" title = gpuDisplayLabel(idx) + " Memory Clock"
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz }) clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
if clock == nil { if clock == nil {
return nil, nil, nil, "", nil, nil, false return nil, nil, nil, "", nil, nil, false, false
} }
datasets = [][]float64{clock} datasets = [][]float64{clock}
names = []string{"Memory Clock MHz"} names = []string{"Memory Clock MHz"}
@@ -753,7 +791,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
title = gpuDisplayLabel(idx) + " Power" title = gpuDisplayLabel(idx) + " Power"
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW }) power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
if power == nil { if power == nil {
return nil, nil, nil, "", nil, nil, false return nil, nil, nil, "", nil, nil, false, false
} }
datasets = [][]float64{power} datasets = [][]float64{power}
names = []string{"Power W"} names = []string{"Power W"}
@@ -761,10 +799,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
} }
default: default:
return nil, nil, nil, "", nil, nil, false return nil, nil, nil, "", nil, nil, false, false
} }
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0 return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
} }
func parseGPUChartPath(path string) (idx int, sub string, ok bool) { func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
@@ -930,6 +968,37 @@ func normalizePowerSeries(ds []float64) []float64 {
return out return out
} }
// psuSlotsFromSamples gathers every distinct PSU slot number that occurs in any
// of the given samples and returns them in ascending order. The result is an
// empty, non-nil slice when no sample carries PSU readings.
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
	present := make(map[int]struct{})
	for i := range samples {
		for j := range samples[i].PSUs {
			present[samples[i].PSUs[j].Slot] = struct{}{}
		}
	}

	// Map iteration order is random, so flatten the set and sort it to give
	// callers a stable slot order.
	ordered := make([]int, 0, len(present))
	for slot := range present {
		ordered = append(ordered, slot)
	}
	sort.Ints(ordered)
	return ordered
}
// psuStackedTotal returns the point-by-point sum of all PSU datasets (for scale calculation).
//
// The result is as long as the longest dataset; a shorter dataset simply
// contributes nothing past its own end. It returns nil when datasets is empty.
func psuStackedTotal(datasets [][]float64) []float64 {
	if len(datasets) == 0 {
		return nil
	}
	// Size the accumulator from the longest series, not just the first one, so
	// ragged input can never index past the end of total.
	n := 0
	for _, ds := range datasets {
		if len(ds) > n {
			n = len(ds)
		}
	}
	total := make([]float64, n)
	for _, ds := range datasets {
		for i, v := range ds {
			total[i] += v
		}
	}
	return total
}
func normalizeFanSeries(ds []float64) []float64 { func normalizeFanSeries(ds []float64) []float64 {
if len(ds) == 0 { if len(ds) == 0 {
return nil return nil

View File

@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
}, },
} }
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples) datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
if !ok { if !ok {
t.Fatal("chartDataFromSamples returned ok=false") t.Fatal("chartDataFromSamples returned ok=false")
} }
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
}, },
} }
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples) datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
if !ok { if !ok {
t.Fatal("chartDataFromSamples returned ok=false") t.Fatal("chartDataFromSamples returned ok=false")
} }
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
}, },
} }
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples) datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
if !ok { if !ok {
t.Fatal("gpu-all-clock returned ok=false") t.Fatal("gpu-all-clock returned ok=false")
} }
@@ -754,9 +754,9 @@ func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
body := rec.Body.String() body := rec.Body.String()
for _, needle := range []string{ for _, needle := range []string{
`NVIDIA Interconnect (NCCL)`, `NVIDIA Interconnect (NCCL)`,
`Runs in Validate and Stress.`, `Validate and Stress:`,
`NVIDIA Bandwidth (NVBandwidth)`, `NVIDIA Bandwidth (NVBandwidth)`,
`Intended to stay short enough for Validate.`, `nvbandwidth runs all built-in tests without a time limit`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body) t.Fatalf("validate page missing %q: %s", needle, body)

View File

@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
} }
return gpuDisplayLabel(idx) + " Overview", buf, true return gpuDisplayLabel(idx) + " Overview", buf, true
} }
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples) datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
if !ok { if !ok {
return "", nil, false return "", nil, false
} }
buf, err := renderMetricChartSVG( var buf []byte
title, var err error
labels, if stacked {
sampleTimes(samples), buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
datasets, } else {
names, buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
yMin, }
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
if err != nil { if err != nil {
return "", nil, false return "", nil, false
} }

View File

@@ -15,6 +15,41 @@ This applies to:
- `iso/builder/config/package-lists/*.list.chroot` - `iso/builder/config/package-lists/*.list.chroot`
- Any package referenced in bootloader configs, hooks, or overlay scripts - Any package referenced in bootloader configs, hooks, or overlay scripts
## Bootloader sync rule
The ISO has two independent bootloader configs that must be kept in sync manually:
| File | Used by |
|------|---------|
| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
live-build does NOT derive one from the other. Any new boot entry, kernel parameter
change, or new mode added to one file must be manually mirrored in the other.
**Canonical entry list** (both files must have all of these):
| Label | Key params |
|-------|-----------|
| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
**Full standard param set** (append after `@APPEND_LIVE@` and the entry-specific flags such as `toram`, `nomodeset`, and `bee.nvidia.mode=…`):
```
net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
nowatchdog nosoftlockup
```
(fail-safe is the exception — it deliberately uses a minimal param set and keeps only `net.ifnames=0 biosdevname=0` from the standard set.)
**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
## Memtest rule ## Memtest rule
Do not assume live-build's built-in memtest integration is sufficient for `bee`. Do not assume live-build's built-in memtest integration is sufficient for `bee`.

View File

@@ -31,6 +31,11 @@ submenu "EASY-BEE (advanced options) -->" {
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }
menuentry "EASY-BEE — KMS + GSP=off" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE — fail-safe" { menuentry "EASY-BEE — fail-safe" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0 linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@

View File

@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
menu default menu default
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
label live-@FLAVOUR@-kms
menu label EASY-BEE (^graphics/KMS)
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
label live-@FLAVOUR@-toram label live-@FLAVOUR@-toram
menu label EASY-BEE (^load to RAM) menu label EASY-BEE (^load to RAM)
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
label live-@FLAVOUR@-gsp-off label live-@FLAVOUR@-gsp-off
menu label EASY-BEE (^NVIDIA GSP=off) menu label EASY-BEE (^NVIDIA GSP=off)
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
label live-@FLAVOUR@-kms-gsp-off label live-@FLAVOUR@-kms
menu label EASY-BEE (g^raphics/KMS, GSP=off) menu label EASY-BEE (^KMS, no nomodeset)
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
label live-@FLAVOUR@-kms-gsp-off
menu label EASY-BEE (KMS, ^GSP=off)
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
label live-@FLAVOUR@-failsafe label live-@FLAVOUR@-failsafe
menu label EASY-BEE (^fail-safe) menu label EASY-BEE (^fail-safe)
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
label memtest label memtest
menu label ^Memory Test (memtest86+) menu label ^Memory Test (memtest86+)