Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| bac89bb6e5 | |||
| 7a618da1f9 | |||
| 64ae1c0ff0 | |||
| 49050ca717 | |||
| 5ba72ab315 | |||
| 63363e9629 |
@@ -140,10 +140,40 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
|||||||
}
|
}
|
||||||
|
|
||||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||||
if err != nil || len(squashfsFiles) == 0 {
|
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
|
||||||
|
dstDir := installToRAMDir
|
||||||
|
|
||||||
|
// If the source medium is unavailable, check whether a previous run already
|
||||||
|
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||||
|
// directly to the loop-rebind / bind-mount steps.
|
||||||
|
if !sourceAvailable {
|
||||||
|
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||||
|
if len(copiedFiles) > 0 {
|
||||||
|
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||||
|
// Proceed to rebind with the already-copied files.
|
||||||
|
for _, dst := range copiedFiles {
|
||||||
|
base := filepath.Base(dst)
|
||||||
|
// Re-associate the loop device that was originally backed by the
|
||||||
|
// source file (now gone); find it by the old source path pattern.
|
||||||
|
srcGuess := "/run/live/medium/live/" + base
|
||||||
|
loopDev, lerr := findLoopForFile(srcGuess)
|
||||||
|
if lerr != nil {
|
||||||
|
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||||
|
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||||
|
} else {
|
||||||
|
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto bindMedium
|
||||||
|
}
|
||||||
|
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
free := freeMemBytes()
|
free := freeMemBytes()
|
||||||
var needed int64
|
var needed int64
|
||||||
for _, sf := range squashfsFiles {
|
for _, sf := range squashfsFiles {
|
||||||
@@ -158,8 +188,8 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
|||||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||||
humanBytes(needed+headroom), humanBytes(free))
|
humanBytes(needed+headroom), humanBytes(free))
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
dstDir := installToRAMDir
|
|
||||||
if state.CopyPresent {
|
if state.CopyPresent {
|
||||||
log("Removing stale partial RAM copy before retry...")
|
log("Removing stale partial RAM copy before retry...")
|
||||||
}
|
}
|
||||||
@@ -199,6 +229,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bindMedium:
|
||||||
log("Copying remaining medium files...")
|
log("Copying remaining medium files...")
|
||||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||||
|
|||||||
@@ -18,11 +18,19 @@ type LiveMetricSample struct {
|
|||||||
Fans []FanReading `json:"fans"`
|
Fans []FanReading `json:"fans"`
|
||||||
Temps []TempReading `json:"temps"`
|
Temps []TempReading `json:"temps"`
|
||||||
PowerW float64 `json:"power_w"`
|
PowerW float64 `json:"power_w"`
|
||||||
|
PSUs []PSUReading `json:"psus,omitempty"`
|
||||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||||
MemLoadPct float64 `json:"mem_load_pct"`
|
MemLoadPct float64 `json:"mem_load_pct"`
|
||||||
GPUs []GPUMetricRow `json:"gpus"`
|
GPUs []GPUMetricRow `json:"gpus"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PSUReading holds the input power reported for a single power supply slot.
type PSUReading struct {
	// Slot is the power supply slot number.
	Slot int `json:"slot"`
	// Name is the name of the sensor the reading came from.
	Name string `json:"name"`
	// PowerW is the reading in watts.
	PowerW float64 `json:"power_w"`
}
|
||||||
|
|
||||||
// TempReading is a named temperature sensor value.
|
// TempReading is a named temperature sensor value.
|
||||||
type TempReading struct {
|
type TempReading struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
@@ -57,6 +65,9 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
// System power — returns 0 if unavailable
|
// System power — returns 0 if unavailable
|
||||||
s.PowerW = sampleSystemPower()
|
s.PowerW = sampleSystemPower()
|
||||||
|
|
||||||
|
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||||
|
s.PSUs = samplePSUPower()
|
||||||
|
|
||||||
// CPU load — from /proc/stat
|
// CPU load — from /proc/stat
|
||||||
s.CPULoadPct = sampleCPULoadPct()
|
s.CPULoadPct = sampleCPULoadPct()
|
||||||
|
|
||||||
@@ -326,3 +337,65 @@ func compactAmbientTempName(chip, name string) string {
|
|||||||
}
|
}
|
||||||
return chip + " / " + name
|
return chip + " / " + name
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||||
|
// It parses `ipmitool sdr elist full` output looking for Power Supply entity
|
||||||
|
// sensors (entity ID "10.N") that report a value in Watts.
|
||||||
|
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||||
|
func samplePSUPower() []PSUReading {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// map slot → reading (keep highest-watt value per slot in case of duplicates)
|
||||||
|
type entry struct {
|
||||||
|
name string
|
||||||
|
powerW float64
|
||||||
|
}
|
||||||
|
bySlot := map[int]entry{}
|
||||||
|
for _, line := range strings.Split(string(out), "\n") {
|
||||||
|
parts := strings.Split(line, "|")
|
||||||
|
if len(parts) < 5 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
|
||||||
|
if !strings.HasPrefix(entityID, "10.") {
|
||||||
|
continue // not a Power Supply entity
|
||||||
|
}
|
||||||
|
slotStr := strings.TrimPrefix(entityID, "10.")
|
||||||
|
slot, err := strconv.Atoi(slotStr)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
|
||||||
|
if !strings.Contains(strings.ToLower(valueField), "watts") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
valueFields := strings.Fields(valueField)
|
||||||
|
if len(valueFields) < 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w, err := strconv.ParseFloat(valueFields[0], 64)
|
||||||
|
if err != nil || w <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sensorName := strings.TrimSpace(parts[0])
|
||||||
|
if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
|
||||||
|
bySlot[slot] = entry{name: sensorName, powerW: w}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(bySlot) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(bySlot))
|
||||||
|
for s := range bySlot {
|
||||||
|
slots = append(slots, s)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
psus := make([]PSUReading, 0, len(slots))
|
||||||
|
for _, s := range slots {
|
||||||
|
e := bySlot[s]
|
||||||
|
psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
|
||||||
|
}
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|||||||
@@ -20,6 +20,54 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Wall-clock duration estimates, in seconds, for each SAT/validate test.
// The numbers are derived from real production logs in _benchmark/_v8/.
//
// Maintenance rule: whenever the commands, timeout parameters, or number of
// sub-jobs inside the corresponding Run*Pack function change, re-measure the
// wall-clock duration from actual task logs and update the matching constant
// below.
//
// Measurements behind the values:
//   - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
//   - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
//   - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
//   - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
//   - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
//   - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
//   - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
//   - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
//   - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
const (
	// CPU, Validate mode: stress-ng 60 s plus lscpu/sensors overhead.
	SATEstimatedCPUValidateSec = 65
	// CPU, Stress mode: stress-ng 1800 s (stress mode default).
	SATEstimatedCPUStressSec = 1800

	// RAM, Validate mode: memtester 256 MB / 1 pass.
	SATEstimatedMemoryValidateSec = 70
	// RAM, Stress mode: memtester 512 MB / 1 pass. Not measured directly;
	// extrapolated from the validate timing assuming time is linear in size.
	SATEstimatedMemoryStressSec = 140

	// NVIDIA dcgmi diag Level 2 (medium); per GPU, GPUs run sequentially.
	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
	// NVIDIA dcgmi diag Level 3 (targeted stress); per GPU, GPUs run sequentially.
	SATEstimatedNvidiaGPUStressPerGPUSec = 450

	// NVIDIA dcgmi targeted_stress: 300 s plus overhead; per GPU, sequential.
	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
	// NVIDIA dcgmi targeted_power: 300 s plus overhead; per GPU, sequential.
	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350

	// NVIDIA dcgmi pulse_test: one run covering all GPUs simultaneously
	// (a whole-run figure, not a per-GPU one).
	SATEstimatedNvidiaPulseTestSec = 5000

	// NCCL all_reduce_perf: one run covering all GPUs simultaneously.
	SATEstimatedNvidiaInterconnectSec = 300
	// nvbandwidth: one run covering all GPUs simultaneously. The tool runs all
	// of its built-in tests and has no user-configurable time limit, so the
	// duration is determined by nvbandwidth itself.
	SATEstimatedNvidiaBandwidthSec = 2700
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
satExecCommand = exec.Command
|
satExecCommand = exec.Command
|
||||||
satLookPath = exec.LookPath
|
satLookPath = exec.LookPath
|
||||||
|
|||||||
@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||||
|
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||||
|
// power charts where the filled area of each PSU shows its individual
|
||||||
|
// contribution and the total height equals the combined draw.
|
||||||
|
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range datasets {
|
||||||
|
if len(datasets[i]) == 0 {
|
||||||
|
datasets[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||||
|
pointCount = len(times)
|
||||||
|
|
||||||
|
// Build cumulative sums per time point.
|
||||||
|
cumulative := make([][]float64, len(datasets)+1)
|
||||||
|
for i := range cumulative {
|
||||||
|
cumulative[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
for i, ds := range datasets {
|
||||||
|
for j, v := range ds {
|
||||||
|
cumulative[i+1][j] = cumulative[i][j] + v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scale is based on the total (top cumulative row).
|
||||||
|
total := cumulative[len(cumulative)-1]
|
||||||
|
yMin := floatPtr(0)
|
||||||
|
if yMax == nil {
|
||||||
|
yMax = autoMax120(total)
|
||||||
|
}
|
||||||
|
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||||
|
|
||||||
|
legendItems := make([]metricChartSeries, len(datasets))
|
||||||
|
for i, name := range names {
|
||||||
|
color := metricChartPalette[i%len(metricChartPalette)]
|
||||||
|
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stats label from totals.
|
||||||
|
statsLabel := chartStatsLabel([][]float64{total})
|
||||||
|
|
||||||
|
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||||
|
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scale)
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
writeSingleAxisY(&b, layout, scale)
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
|
||||||
|
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
// Draw border polylines on top.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
|
||||||
|
writeLegend(&b, layout, legendItems)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeStackedArea draws a filled polygon between two cumulative value arrays
|
||||||
|
// (baseline and top), using the given color at 55% opacity.
|
||||||
|
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
|
||||||
|
n := len(top)
|
||||||
|
if n == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(baseline) < n {
|
||||||
|
baseline = make([]float64, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward path along top values, then backward along baseline values.
|
||||||
|
var points strings.Builder
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
if i > 0 {
|
||||||
|
points.WriteByte(' ')
|
||||||
|
}
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
for i := n - 1; i >= 0; i-- {
|
||||||
|
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
points.WriteByte(' ')
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
|
||||||
|
}
|
||||||
|
|
||||||
// writeSVGOpen writes the opening <svg> tag, followed by a newline, to b.
// width and height are used both for the element's size attributes and for
// the extent of a viewBox anchored at the origin.
func writeSVGOpen(b *strings.Builder, width, height int) {
	const openTag = `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">` + "\n"
	fmt.Fprintf(b, openTag, width, height, width, height)
}
|
||||||
|
|||||||
@@ -1383,10 +1383,59 @@ type validateInventory struct {
|
|||||||
Storage string
|
Storage string
|
||||||
NVIDIA string
|
NVIDIA string
|
||||||
AMD string
|
AMD string
|
||||||
|
NvidiaGPUCount int
|
||||||
|
AMDGPUCount int
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateFmtDur renders a duration given in seconds as a short approximate
// string for display.
//
// Durations under 120 seconds are shown in seconds ("~N s"). Anything longer
// is shown in whole minutes ("~N min"), computed as (secs+29)/60: the value is
// rounded to the nearest minute, with exact half-minutes rounding down
// (150 s is "~2 min", 151 s is "~3 min").
func validateFmtDur(secs int) string {
	const minutesFrom = 120 // shortest duration reported in minutes
	if secs >= minutesFrom {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||||
|
|
||||||
|
// validateTotalValidateSec returns the estimated wall-clock duration of
|
||||||
|
// "Validate one by one" in Validate mode for n NVIDIA GPUs.
|
||||||
|
func validateTotalValidateSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
|
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateTotalStressSec returns the estimated wall-clock duration of
|
||||||
|
// "Validate one by one" in Stress mode for n NVIDIA GPUs.
|
||||||
|
func validateTotalStressSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
|
platform.SATEstimatedMemoryStressSec +
|
||||||
|
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
||||||
|
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
||||||
|
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
||||||
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
return total
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderValidate(opts HandlerOptions) string {
|
func renderValidate(opts HandlerOptions) string {
|
||||||
inv := loadValidateInventory(opts)
|
inv := loadValidateInventory(opts)
|
||||||
|
n := inv.NvidiaGPUCount
|
||||||
|
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||||
|
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||||
|
gpuNote := ""
|
||||||
|
if n > 0 {
|
||||||
|
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||||
|
}
|
||||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
@@ -1396,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
<div class="validate-profile-col">
|
<div class="validate-profile-col">
|
||||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="validate-profile-col validate-profile-action">
|
<div class="validate-profile-col validate-profile-action">
|
||||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
<div style="margin-top:12px">
|
<div style="margin-top:12px">
|
||||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
@@ -1413,19 +1462,19 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.CPU,
|
inv.CPU,
|
||||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
`60s in Validate, 30 min in Stress.`,
|
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
inv.Memory,
|
inv.Memory,
|
||||||
`Runs a RAM validation pass and records memory state around the test.`,
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
`<code>free</code>, <code>memtester</code>`,
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
|
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
inv.Storage,
|
inv.Storage,
|
||||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
`Short self-test in Validate, extended self-test in Stress.`,
|
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>
|
`</div>
|
||||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
@@ -1450,14 +1499,33 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
|
func() string {
|
||||||
|
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
||||||
|
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
||||||
|
if n > 0 {
|
||||||
|
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
||||||
|
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
||||||
|
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
||||||
|
validateFmtDur(perV), validateFmtDur(perS))
|
||||||
|
}(),
|
||||||
)) +
|
)) +
|
||||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
func() string {
|
||||||
|
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
||||||
|
s := "Skipped in Validate. "
|
||||||
|
if n > 0 {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||||
|
} else {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||||
|
}
|
||||||
|
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||||
|
}(),
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-targeted-power">` +
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
@@ -1465,7 +1533,16 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
`<code>dcgmi diag targeted_power</code>`,
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
func() string {
|
||||||
|
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
||||||
|
s := "Skipped in Validate. "
|
||||||
|
if n > 0 {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||||
|
} else {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||||
|
}
|
||||||
|
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||||
|
}(),
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-pulse">` +
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
@@ -1473,7 +1550,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||||
`<code>dcgmi diag pulse_test</code>`,
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-interconnect">` +
|
`<div id="sat-card-nvidia-interconnect">` +
|
||||||
@@ -1481,7 +1558,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-bandwidth">` +
|
`<div id="sat-card-nvidia-bandwidth">` +
|
||||||
@@ -1489,7 +1566,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
`<code>nvbandwidth</code>`,
|
`<code>nvbandwidth</code>`,
|
||||||
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`</div>
|
`</div>
|
||||||
@@ -1922,6 +1999,8 @@ func loadValidateInventory(opts HandlerOptions) validateInventory {
|
|||||||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||||
|
out.NvidiaGPUCount = nvidiaTotal
|
||||||
|
out.AMDGPUCount = amdTotal
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -575,12 +575,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
}
|
}
|
||||||
timeline := metricsTimelineSegments(samples, time.Now())
|
timeline := metricsTimelineSegments(samples, time.Now())
|
||||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||||
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
var overviewOk bool
|
||||||
|
var buf []byte
|
||||||
|
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if !ok {
|
if !overviewOk {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -589,13 +591,26 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
buf, err := renderMetricChartSVG(
|
var buf []byte
|
||||||
|
if stacked {
|
||||||
|
buf, err = renderStackedMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
buf, err = renderMetricChartSVG(
|
||||||
title,
|
title,
|
||||||
labels,
|
labels,
|
||||||
sampleTimes(samples),
|
sampleTimes(samples),
|
||||||
@@ -606,6 +621,7 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
chartCanvasHeightForPath(path, len(names)),
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
timeline,
|
timeline,
|
||||||
)
|
)
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
@@ -615,12 +631,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
|
||||||
var datasets [][]float64
|
labels = sampleTimeLabels(samples)
|
||||||
var names []string
|
|
||||||
var title string
|
|
||||||
var yMin, yMax *float64
|
|
||||||
labels := sampleTimeLabels(samples)
|
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case path == "server-load":
|
case path == "server-load":
|
||||||
@@ -656,6 +668,31 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
|
|
||||||
case path == "server-power":
|
case path == "server-power":
|
||||||
title = "System Power"
|
title = "System Power"
|
||||||
|
// Use per-PSU stacked chart when PSU SDR data is available.
|
||||||
|
// Collect the union of PSU slots seen across all samples.
|
||||||
|
psuSlots := psuSlotsFromSamples(samples)
|
||||||
|
if len(psuSlots) > 1 {
|
||||||
|
// Build one dataset per PSU slot.
|
||||||
|
psuDatasets := make([][]float64, len(psuSlots))
|
||||||
|
psuNames := make([]string, len(psuSlots))
|
||||||
|
for si, slot := range psuSlots {
|
||||||
|
ds := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
for _, psu := range s.PSUs {
|
||||||
|
if psu.Slot == slot {
|
||||||
|
ds[i] = psu.PowerW
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
psuDatasets[si] = normalizePowerSeries(ds)
|
||||||
|
psuNames[si] = fmt.Sprintf("PSU %d", slot)
|
||||||
|
}
|
||||||
|
datasets = psuDatasets
|
||||||
|
names = psuNames
|
||||||
|
stacked = true
|
||||||
|
yMax = autoMax120(psuStackedTotal(psuDatasets))
|
||||||
|
} else {
|
||||||
power := make([]float64, len(samples))
|
power := make([]float64, len(samples))
|
||||||
for i, s := range samples {
|
for i, s := range samples {
|
||||||
power[i] = s.PowerW
|
power[i] = s.PowerW
|
||||||
@@ -665,6 +702,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(power)
|
yMax = autoMax120(power)
|
||||||
|
}
|
||||||
|
|
||||||
case path == "server-fans":
|
case path == "server-fans":
|
||||||
title = "Fan RPM"
|
title = "Fan RPM"
|
||||||
@@ -707,7 +745,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
case strings.HasPrefix(path, "gpu/"):
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
idx, sub, ok := parseGPUChartPath(path)
|
idx, sub, ok := parseGPUChartPath(path)
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
switch sub {
|
switch sub {
|
||||||
case "load":
|
case "load":
|
||||||
@@ -715,7 +753,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
if util == nil && mem == nil {
|
if util == nil && mem == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||||
names = []string{"Load %", "Mem %"}
|
names = []string{"Load %", "Mem %"}
|
||||||
@@ -725,7 +763,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Temperature"
|
title = gpuDisplayLabel(idx) + " Temperature"
|
||||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
if temp == nil {
|
if temp == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{temp}
|
datasets = [][]float64{temp}
|
||||||
names = []string{"Temp °C"}
|
names = []string{"Temp °C"}
|
||||||
@@ -735,7 +773,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Core Clock"
|
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{clock}
|
datasets = [][]float64{clock}
|
||||||
names = []string{"Core Clock MHz"}
|
names = []string{"Core Clock MHz"}
|
||||||
@@ -744,7 +782,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Memory Clock"
|
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{clock}
|
datasets = [][]float64{clock}
|
||||||
names = []string{"Memory Clock MHz"}
|
names = []string{"Memory Clock MHz"}
|
||||||
@@ -753,7 +791,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Power"
|
title = gpuDisplayLabel(idx) + " Power"
|
||||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
if power == nil {
|
if power == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
@@ -761,10 +799,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
}
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
|
|
||||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
||||||
@@ -930,6 +968,37 @@ func normalizePowerSeries(ds []float64) []float64 {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// psuSlotsFromSamples returns the sorted list of PSU slot numbers seen across samples.
|
||||||
|
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]struct{}{}
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, p := range s.PSUs {
|
||||||
|
seen[p.Slot] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(seen))
|
||||||
|
for s := range seen {
|
||||||
|
slots = append(slots, s)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
return slots
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuStackedTotal returns the point-by-point sum of all PSU datasets. The sum
// is the upper envelope of the stacked chart and is used for scale
// calculation.
//
// The result is as long as the longest dataset; a shorter dataset simply
// contributes nothing past its own end, so ragged input cannot index out of
// range. A nil or empty datasets slice yields nil.
func psuStackedTotal(datasets [][]float64) []float64 {
	if len(datasets) == 0 {
		return nil
	}

	// Size the accumulator by the longest series rather than by datasets[0]:
	// a later, longer series would otherwise write past the end of total.
	n := 0
	for _, ds := range datasets {
		if len(ds) > n {
			n = len(ds)
		}
	}

	total := make([]float64, n)
	for _, ds := range datasets {
		for i, v := range ds {
			total[i] += v
		}
	}
	return total
}
|
||||||
|
|
||||||
func normalizeFanSeries(ds []float64) []float64 {
|
func normalizeFanSeries(ds []float64) []float64 {
|
||||||
if len(ds) == 0 {
|
if len(ds) == 0 {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("gpu-all-clock returned ok=false")
|
t.Fatal("gpu-all-clock returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -754,9 +754,9 @@ func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
|||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`NVIDIA Interconnect (NCCL)`,
|
`NVIDIA Interconnect (NCCL)`,
|
||||||
`Runs in Validate and Stress.`,
|
`Validate and Stress:`,
|
||||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||||
`Intended to stay short enough for Validate.`,
|
`nvbandwidth runs all built-in tests without a time limit`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
|||||||
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
|
|||||||
}
|
}
|
||||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
return "", nil, false
|
return "", nil, false
|
||||||
}
|
}
|
||||||
buf, err := renderMetricChartSVG(
|
var buf []byte
|
||||||
title,
|
var err error
|
||||||
labels,
|
if stacked {
|
||||||
sampleTimes(samples),
|
buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||||
datasets,
|
} else {
|
||||||
names,
|
buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||||
yMin,
|
}
|
||||||
yMax,
|
|
||||||
chartCanvasHeightForPath(path, len(names)),
|
|
||||||
timeline,
|
|
||||||
)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", nil, false
|
return "", nil, false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -613,8 +613,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
}
|
}
|
||||||
a := q.opts.App
|
a := q.opts.App
|
||||||
|
|
||||||
|
recovered := len(j.lines) > 0
|
||||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||||
if len(j.lines) > 0 {
|
if recovered {
|
||||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,41 @@ This applies to:
|
|||||||
- `iso/builder/config/package-lists/*.list.chroot`
|
- `iso/builder/config/package-lists/*.list.chroot`
|
||||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||||
|
|
||||||
|
## Bootloader sync rule
|
||||||
|
|
||||||
|
The ISO has two independent bootloader configs that must be kept in sync manually:
|
||||||
|
|
||||||
|
| File | Used by |
|
||||||
|
|------|---------|
|
||||||
|
| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
|
||||||
|
| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
|
||||||
|
|
||||||
|
live-build does NOT derive one from the other. Any new boot entry, kernel parameter
|
||||||
|
change, or new mode added to one file must be manually mirrored in the other.
|
||||||
|
|
||||||
|
**Canonical entry list** (both files must have all of these):
|
||||||
|
|
||||||
|
| Label | Key params |
|
||||||
|
|-------|-----------|
|
||||||
|
| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
|
||||||
|
| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
|
||||||
|
|
||||||
|
**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
|
||||||
|
```
|
||||||
|
net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
|
||||||
|
numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
nowatchdog nosoftlockup
|
||||||
|
```
|
||||||
|
(fail-safe is the exception — it deliberately uses minimal params.)
|
||||||
|
|
||||||
|
**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
|
||||||
|
live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
|
||||||
|
configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
|
||||||
|
|
||||||
## Memtest rule
|
## Memtest rule
|
||||||
|
|
||||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||||
|
|||||||
@@ -16,6 +16,11 @@ menuentry "EASY-BEE" {
|
|||||||
}
|
}
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
submenu "EASY-BEE (advanced options) -->" {
|
||||||
|
menuentry "EASY-BEE — load to RAM (toram)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE — GSP=off" {
|
menuentry "EASY-BEE — GSP=off" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
@@ -26,6 +31,11 @@ submenu "EASY-BEE (advanced options) -->" {
|
|||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE — KMS + GSP=off" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE — fail-safe" {
|
menuentry "EASY-BEE — fail-safe" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
|
|||||||
menu default
|
menu default
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms
|
|
||||||
menu label EASY-BEE (^graphics/KMS)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE (^load to RAM)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-gsp-off
|
label live-@FLAVOUR@-gsp-off
|
||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms-gsp-off
|
label live-@FLAVOUR@-kms
|
||||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
menu label EASY-BEE (^KMS, no nomodeset)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (KMS, ^GSP=off)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
label live-@FLAVOUR@-failsafe
|
||||||
menu label EASY-BEE (^fail-safe)
|
menu label EASY-BEE (^fail-safe)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
label memtest
|
label memtest
|
||||||
menu label ^Memory Test (memtest86+)
|
menu label ^Memory Test (memtest86+)
|
||||||
|
|||||||
@@ -65,6 +65,8 @@ chmod +x /usr/local/bin/bee 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9011-toram-rsync.hook.chroot
|
||||||
|
#
|
||||||
|
# Adds rsync to the initramfs so that live-boot's toram code takes the
|
||||||
|
# rsync --progress path instead of the silent "cp -a" fallback.
|
||||||
|
#
|
||||||
|
# live-boot's 9990-toram-todisk.sh already contains:
|
||||||
|
# if [ -x /bin/rsync ]; then
|
||||||
|
# rsync -a --progress ... 1>/dev/console
|
||||||
|
# else
|
||||||
|
# cp -a ... # no output
|
||||||
|
# fi
|
||||||
|
#
|
||||||
|
# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
|
||||||
|
# which copies the binary + all shared-library dependencies into the initrd.
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
HOOK_DIR="/etc/initramfs-tools/hooks"
|
||||||
|
HOOK="${HOOK_DIR}/bee-rsync"
|
||||||
|
|
||||||
|
mkdir -p "${HOOK_DIR}"
|
||||||
|
|
||||||
|
cat > "${HOOK}" << 'EOF'
|
||||||
|
#!/bin/sh
|
||||||
|
# initramfs hook: include rsync for live-boot toram progress output
|
||||||
|
PREREQ=""
|
||||||
|
prereqs() { echo "$PREREQ"; }
|
||||||
|
case "$1" in prereqs) prereqs; exit 0 ;; esac
|
||||||
|
|
||||||
|
. /usr/share/initramfs-tools/hook-functions
|
||||||
|
|
||||||
|
if [ -x /usr/bin/rsync ]; then
|
||||||
|
copy_exec /usr/bin/rsync /bin
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
|
||||||
|
chmod +x "${HOOK}"
|
||||||
|
|
||||||
|
echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"
|
||||||
|
|
||||||
|
# Rebuild initramfs so the hook takes effect in the ISO's initrd.img
|
||||||
|
KVER=$(ls /lib/modules | sort -V | tail -1)
|
||||||
|
echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
|
||||||
|
update-initramfs -u -k "${KVER}"
|
||||||
|
echo "9011-toram-rsync: done"
|
||||||
@@ -3,6 +3,7 @@ dmidecode
|
|||||||
smartmontools
|
smartmontools
|
||||||
nvme-cli
|
nvme-cli
|
||||||
pciutils
|
pciutils
|
||||||
|
rsync
|
||||||
ipmitool
|
ipmitool
|
||||||
util-linux
|
util-linux
|
||||||
e2fsprogs
|
e2fsprogs
|
||||||
|
|||||||
@@ -65,6 +65,9 @@ done
|
|||||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
||||||
|
echo " The live medium may have been disconnected." >&2
|
||||||
|
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
||||||
|
echo " Then re-run bee-install." >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -162,10 +165,59 @@ log " Mounted."
|
|||||||
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
||||||
log " Source: $SQUASHFS"
|
log " Source: $SQUASHFS"
|
||||||
log " Target: $MOUNT_ROOT"
|
log " Target: $MOUNT_ROOT"
|
||||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
|
||||||
grep -E '^\[|^inod|^created|^extract' | \
|
# unsquashfs does not support resume, so retry the entire unpack step if the
|
||||||
while read -r line; do log " $line"; done || true
|
# source medium disappears mid-copy (e.g. CD physically disconnected).
|
||||||
log " Unpack complete."
|
UNPACK_ATTEMPTS=0
|
||||||
|
UNPACK_MAX=5
|
||||||
|
while true; do
|
||||||
|
UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
|
||||||
|
if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
|
||||||
|
die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
|
||||||
|
fi
|
||||||
|
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
||||||
|
|
||||||
|
# Re-check squashfs is reachable before each attempt
|
||||||
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
|
log " SOURCE LOST: $SQUASHFS not found."
|
||||||
|
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
||||||
|
log " then press Enter here to retry."
|
||||||
|
read -r _
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# wipe partial unpack so unsquashfs starts clean
|
||||||
|
if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
|
||||||
|
log " Cleaning partial unpack from $MOUNT_ROOT ..."
|
||||||
|
# keep the mount point itself but remove its contents
|
||||||
|
find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
UNPACK_OK=0
|
||||||
|
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
||||||
|
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
||||||
|
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
||||||
|
|
||||||
|
# Check squashfs is still reachable (gone = disc pulled during copy)
|
||||||
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
|
log " WARNING: source medium lost during unpack — will retry after remount."
|
||||||
|
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
||||||
|
read -r _
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Verify the unpack produced a usable root (presence of /etc is a basic check)
|
||||||
|
if [ -d "${MOUNT_ROOT}/etc" ]; then
|
||||||
|
log " Unpack complete."
|
||||||
|
break
|
||||||
|
else
|
||||||
|
log " WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
|
||||||
|
if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
|
||||||
|
log " Retrying in 5 s ..."
|
||||||
|
sleep 5
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
log "--- Step 6/7: Configuring installed system ---"
|
log "--- Step 6/7: Configuring installed system ---"
|
||||||
|
|||||||
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
|
||||||
|
#
|
||||||
|
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
|
||||||
|
# was lost and /run/live/medium/live/filesystem.squashfs is missing.
|
||||||
|
#
|
||||||
|
# Usage: bee-remount-medium [--wait]
|
||||||
|
# --wait keep retrying every 5 seconds until the medium is found (useful
|
||||||
|
# while physically reconnecting the device)
|
||||||
|
|
||||||
|
# Abort on unhandled command failures, unset variables and failed pipeline stages.
set -euo pipefail

# Mount point the live medium is remounted onto.
MEDIUM_DIR="/run/live/medium"
# Path of the squashfs image relative to the root of the medium; its presence
# is what identifies a device as the live medium.
SQUASHFS_REL="live/filesystem.squashfs"
# 1 = keep rescanning until the medium shows up, 0 = scan once.
WAIT_MODE=0

# usage — print the help text to stdout.
usage() {
    echo "Usage: bee-remount-medium [--wait]"
    echo " Finds and remounts the live ISO medium to $MEDIUM_DIR"
    echo " --wait retry every 5 s until a medium with squashfs is found"
}

# parse_args — interpret the command-line arguments.
#   --wait | -w   set WAIT_MODE=1
#   --help | -h   print usage and exit 0
# Anything else is rejected with exit status 2 instead of being silently
# ignored, so a mistyped --wait cannot degrade into a single scan.
parse_args() {
    local arg
    for arg in "$@"; do
        case "$arg" in
            --wait|-w)
                WAIT_MODE=1
                ;;
            --help|-h)
                usage
                exit 0
                ;;
            *)
                # die() is defined further down the script, so report directly.
                echo "bee-remount-medium: unknown argument: $arg" >&2
                usage >&2
                exit 2
                ;;
        esac
    done
}

parse_args "$@"
|
||||||
|
|
||||||
|
# log — write the arguments to stdout as one line prefixed with [HH:MM:SS].
log() {
    printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}

# die — log the arguments as an ERROR line on stderr and exit with status 1.
die() {
    log "ERROR: $*" >&2
    exit 1
}
|
||||||
|
|
||||||
|
# find_candidates — print candidate block devices for the live medium, one
# path per line on stdout.
#
# Optical drives (/dev/sr*, /dev/scd*) are listed first, followed by every
# /dev/sd* and /dev/vd* node (whole disks and partitions) whose parent disk
# reports removable=1 in sysfs. Nothing is mounted or probed here, and no
# attempt is made to exclude the device the system is currently running from.
#
# Always returns 0, even when nothing matches, so that a plain call cannot
# abort the script under `set -e`.
find_candidates() {
    local dev base disk removable

    # CD/DVD drives. An unmatched glob stays literal and is rejected by -b.
    for dev in /dev/sr* /dev/scd*; do
        if [ -b "$dev" ]; then
            echo "$dev"
        fi
    done

    # USB/removable disks and their partitions.
    for dev in /dev/sd* /dev/vd*; do
        [ -b "$dev" ] || continue

        base=${dev##*/}
        # Strip the partition number (sdb1 -> sdb): the removable flag is
        # read from the parent disk's sysfs directory.
        disk=${base%%[0-9]*}

        # A missing or unreadable attribute counts as "not removable".
        removable=0
        if [ -r "/sys/block/${disk}/removable" ]; then
            read -r removable < "/sys/block/${disk}/removable" || true
        fi

        if [ "$removable" = "1" ]; then
            echo "$dev"
        fi
    done

    return 0
}
|
||||||
|
|
||||||
|
# try_mount DEVICE — check whether DEVICE carries the live squashfs and, if
# so, mount it read-only on $MEDIUM_DIR.
#
# The device is first mounted read-only on a throw-away directory under /tmp.
# Only when ${SQUASHFS_REL} exists there is the probe mount released, whatever
# is currently mounted on $MEDIUM_DIR unmounted (failures ignored), and the
# device mounted on $MEDIUM_DIR.
#
# Returns 0 when the device ends up mounted on $MEDIUM_DIR; returns 1 when the
# probe mount fails, the squashfs is absent, or the final mount fails.
try_mount() {
    local device="$1"
    local probe_dir
    probe_dir=$(mktemp -d /tmp/bee-probe-XXXXXX)

    # Device cannot be mounted at all: nothing to unmount, just drop the dir.
    if ! mount -o ro "$device" "$probe_dir" 2>/dev/null; then
        rmdir "$probe_dir" 2>/dev/null || true
        return 1
    fi

    # Mountable, but not our medium: release the probe mount and give up.
    if [ ! -f "${probe_dir}/${SQUASHFS_REL}" ]; then
        umount "$probe_dir" 2>/dev/null || true
        rmdir "$probe_dir" 2>/dev/null || true
        return 1
    fi

    # Found it: release the probe mount before mounting on the live path.
    umount "$probe_dir" 2>/dev/null || true
    rmdir "$probe_dir" 2>/dev/null || true

    # Whatever sits on MEDIUM_DIR right now may be empty or stale.
    umount "$MEDIUM_DIR" 2>/dev/null || true
    mkdir -p "$MEDIUM_DIR"

    if mount -o ro "$device" "$MEDIUM_DIR"; then
        log "Mounted $device on $MEDIUM_DIR"
        return 0
    fi

    log "Mount of $device on $MEDIUM_DIR failed"
    return 1
}
|
||||||
|
|
||||||
|
# attempt — run one scan: offer every device printed by find_candidates to
# try_mount and stop at the first one that gets mounted on $MEDIUM_DIR.
#
# Returns 0 after logging the squashfs path and its size as reported by du;
# returns 1 when no candidate could be mounted.
attempt() {
    local squashfs_path="${MEDIUM_DIR}/${SQUASHFS_REL}"
    local candidate

    log "Scanning for ISO medium..."

    for candidate in $(find_candidates); do
        log " Trying $candidate ..."
        try_mount "$candidate" || continue
        log "SUCCESS: squashfs available at $squashfs_path ($(du -sh "$squashfs_path" | cut -f1))"
        return 0
    done

    return 1
}
|
||||||
|
|
||||||
|
# Entry point.
#   default: scan once; exit 1 through die when no medium is found.
#   --wait : rescan every 5 seconds until a medium is mounted, then exit 0.
if [ "$WAIT_MODE" != "1" ]; then
    if ! attempt; then
        die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
    fi
else
    log "Waiting for live medium (press Ctrl+C to abort)..."
    until attempt; do
        log " Not found — retrying in 5 s (reconnect the disc now)"
        sleep 5
    done
    exit 0
fi
|
||||||
Reference in New Issue
Block a user