Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 51b721aeb3 | | |
| | bac89bb6e5 | | |
| | 7a618da1f9 | | |
| | 64ae1c0ff0 | | |
| | 49050ca717 | | |
| | 5ba72ab315 | | |
| | 63363e9629 | | |
| | 5285c0d101 | | |
| | dca4afb8d0 | | |
| | b4280941f5 | | |
| | f74976ec4c | | |
| | 18e24a9aa5 | | |
@@ -146,7 +146,7 @@ type satRunner interface {
|
|||||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
|
|||||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RunNCCLTests delegates to the App's SAT runner (a.sat.RunNCCLTests).
//
// If baseDir is empty or contains only whitespace it is replaced with
// DefaultSATBaseDir before the call; ctx, gpuIndices and logFunc are passed
// through unchanged. The string and error returned by the runner are returned
// as-is (the string is presumably the path of the produced results — callers
// in this file print it as "Results: <path>").
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||||
body := "Results: " + path
|
body := "Results: " + path
|
||||||
if err != nil && err != context.Canceled {
|
if err != nil && err != context.Canceled {
|
||||||
body += "\nERROR: " + err.Error()
|
body += "\nERROR: " + err.Error()
|
||||||
|
|||||||
@@ -128,6 +128,7 @@ type fakeSAT struct {
|
|||||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||||
|
runNCCLFn func(string, []int) (string, error)
|
||||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||||
runMemoryFn func(string) (string, error)
|
runMemoryFn func(string) (string, error)
|
||||||
runStorageFn func(string) (string, error)
|
runStorageFn func(string) (string, error)
|
||||||
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
|||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNCCLFn != nil {
|
||||||
|
return f.runNCCLFn(baseDir, gpuIndices)
|
||||||
|
}
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestRunNCCLTestsPassesSelectedGPUs checks that App.RunNCCLTests forwards the
// caller-supplied base directory and GPU index list (in the given order) to
// the SAT runner, and that it returns the path produced by the runner. The
// runner is a fakeSAT whose runNCCLFn hook records the arguments it receives.
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var gotBaseDir string
|
||||||
|
var gotGPUIndices []int
|
||||||
|
a := &App{
|
||||||
|
sat: fakeSAT{
|
||||||
|
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||||
|
gotBaseDir = baseDir
|
||||||
|
// Copy the slice so the recorded value does not alias the caller's backing array.
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||||
|
return "/tmp/nccl-tests.tar.gz", nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RunNCCLTests error: %v", err)
|
||||||
|
}
|
||||||
|
if path != "/tmp/nccl-tests.tar.gz" {
|
||||||
|
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||||
|
}
|
||||||
|
if gotBaseDir != "/tmp/sat" {
|
||||||
|
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||||
|
}
|
||||||
|
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||||
|
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -59,6 +59,9 @@ type benchmarkPowerCalibrationResult struct {
|
|||||||
// ≥20% while server fans were below 100% duty cycle — a signal that the
|
// ≥20% while server fans were below 100% duty cycle — a signal that the
|
||||||
// cooling system may not be correctly configured for full GPU load.
|
// cooling system may not be correctly configured for full GPU load.
|
||||||
CoolingWarning string
|
CoolingWarning string
|
||||||
|
// MetricRows holds the telemetry rows from the final (converged) attempt
|
||||||
|
// for this GPU. Used to build per-run gpu-metrics.csv.
|
||||||
|
MetricRows []GPUMetricRow
|
||||||
}
|
}
|
||||||
|
|
||||||
type benchmarkBurnProfile struct {
|
type benchmarkBurnProfile struct {
|
||||||
@@ -1122,6 +1125,7 @@ type benchmarkCoolingSample struct {
|
|||||||
AvgFanRPM float64
|
AvgFanRPM float64
|
||||||
AvgFanDutyCyclePct float64
|
AvgFanDutyCyclePct float64
|
||||||
FanDutyCycleAvailable bool
|
FanDutyCycleAvailable bool
|
||||||
|
FanDutyCycleEstimated bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
|
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||||
@@ -1134,6 +1138,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
samples[i].FanAvgRPM = fanSample.AvgFanRPM
|
samples[i].FanAvgRPM = fanSample.AvgFanRPM
|
||||||
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
|
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
|
||||||
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
|
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
|
||||||
|
samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
|
||||||
}
|
}
|
||||||
return samples, nil
|
return samples, nil
|
||||||
}
|
}
|
||||||
@@ -1141,11 +1146,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
|
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
|
||||||
fans, _ := sampleFanSpeeds()
|
fans, _ := sampleFanSpeeds()
|
||||||
avgRPM, _, _ := fanRPMStats(fans)
|
avgRPM, _, _ := fanRPMStats(fans)
|
||||||
dutyPct, dutyAvailable := sampleFanDutyCyclePct()
|
dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
|
||||||
return benchmarkCoolingSample{
|
return benchmarkCoolingSample{
|
||||||
AvgFanRPM: avgRPM,
|
AvgFanRPM: avgRPM,
|
||||||
AvgFanDutyCyclePct: dutyPct,
|
AvgFanDutyCyclePct: dutyPct,
|
||||||
FanDutyCycleAvailable: dutyAvailable,
|
FanDutyCycleAvailable: dutyAvailable,
|
||||||
|
FanDutyCycleEstimated: dutyEstimated,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1387,25 +1393,33 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
|
|||||||
}
|
}
|
||||||
var rpmValues []float64
|
var rpmValues []float64
|
||||||
var dutyValues []float64
|
var dutyValues []float64
|
||||||
|
var dutyEstimated bool
|
||||||
for _, row := range rows {
|
for _, row := range rows {
|
||||||
if row.FanAvgRPM > 0 {
|
if row.FanAvgRPM > 0 {
|
||||||
rpmValues = append(rpmValues, row.FanAvgRPM)
|
rpmValues = append(rpmValues, row.FanAvgRPM)
|
||||||
}
|
}
|
||||||
if row.FanDutyCycleAvailable {
|
if row.FanDutyCycleAvailable {
|
||||||
dutyValues = append(dutyValues, row.FanDutyCyclePct)
|
dutyValues = append(dutyValues, row.FanDutyCyclePct)
|
||||||
|
if row.FanDutyCycleEstimated {
|
||||||
|
dutyEstimated = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(rpmValues) == 0 && len(dutyValues) == 0 {
|
if len(rpmValues) == 0 && len(dutyValues) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
summary := &BenchmarkCoolingSummary{
|
summary := &BenchmarkCoolingSummary{
|
||||||
Available: true,
|
Available: true,
|
||||||
AvgFanRPM: benchmarkMean(rpmValues),
|
AvgFanRPM: benchmarkMean(rpmValues),
|
||||||
|
FanDutyCycleEstimated: dutyEstimated,
|
||||||
}
|
}
|
||||||
if len(dutyValues) > 0 {
|
if len(dutyValues) > 0 {
|
||||||
summary.FanDutyCycleAvailable = true
|
summary.FanDutyCycleAvailable = true
|
||||||
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
|
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
|
||||||
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
|
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
|
||||||
|
if summary.FanDutyCycleEstimated {
|
||||||
|
summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
|
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
|
||||||
}
|
}
|
||||||
@@ -2770,7 +2784,7 @@ func runBenchmarkPowerCalibration(
|
|||||||
infoByIndex map[int]benchmarkGPUInfo,
|
infoByIndex map[int]benchmarkGPUInfo,
|
||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
seedLimits map[int]int,
|
seedLimits map[int]int,
|
||||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
|
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
|
||||||
const calibDurationSec = 120
|
const calibDurationSec = 120
|
||||||
const maxDerateW = 150
|
const maxDerateW = 150
|
||||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||||
@@ -2784,7 +2798,7 @@ func runBenchmarkPowerCalibration(
|
|||||||
|
|
||||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||||
return map[int]benchmarkPowerCalibrationResult{}, nil
|
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||||
}
|
}
|
||||||
if killed := KillTestWorkers(); len(killed) > 0 {
|
if killed := KillTestWorkers(); len(killed) > 0 {
|
||||||
for _, p := range killed {
|
for _, p := range killed {
|
||||||
@@ -2818,6 +2832,8 @@ func runBenchmarkPowerCalibration(
|
|||||||
|
|
||||||
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
|
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
|
||||||
var restore []benchmarkRestoreAction
|
var restore []benchmarkRestoreAction
|
||||||
|
var allCalibRows []GPUMetricRow // accumulated telemetry across all attempts
|
||||||
|
var calibCursor float64
|
||||||
|
|
||||||
// Initialise per-GPU state.
|
// Initialise per-GPU state.
|
||||||
states := make([]*gpuCalibState, 0, len(gpuIndices))
|
states := make([]*gpuCalibState, 0, len(gpuIndices))
|
||||||
@@ -2970,6 +2986,8 @@ calibDone:
|
|||||||
ticker.Stop()
|
ticker.Stop()
|
||||||
cancelAttempt()
|
cancelAttempt()
|
||||||
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
||||||
|
// Accumulate telemetry rows with attempt stage label.
|
||||||
|
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
|
||||||
|
|
||||||
// Resource busy: retry with exponential back-off (shared — one DCGM session).
|
// Resource busy: retry with exponential back-off (shared — one DCGM session).
|
||||||
if ar.err != nil && isDCGMResourceBusy(ar.err) {
|
if ar.err != nil && isDCGMResourceBusy(ar.err) {
|
||||||
@@ -3054,6 +3072,7 @@ calibDone:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -3092,6 +3111,7 @@ calibDone:
|
|||||||
} else {
|
} else {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||||
}
|
}
|
||||||
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -3129,7 +3149,8 @@ calibDone:
|
|||||||
results[s.idx] = s.calib
|
results[s.idx] = s.calib
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return results, restore
|
writeBenchmarkMetricsFiles(runDir, allCalibRows)
|
||||||
|
return results, restore, allCalibRows
|
||||||
}
|
}
|
||||||
|
|
||||||
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
||||||
@@ -3219,21 +3240,25 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
if len(result.RampSteps) > 0 {
|
if len(result.RampSteps) > 0 {
|
||||||
b.WriteString("## Ramp Sequence\n\n")
|
b.WriteString("## Ramp Sequence\n\n")
|
||||||
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
|
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
|
||||||
b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
|
b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
|
||||||
for _, step := range result.RampSteps {
|
for _, step := range result.RampSteps {
|
||||||
derated := "-"
|
derated := "-"
|
||||||
if step.Derated {
|
if step.Derated {
|
||||||
derated = "⚠ yes"
|
derated = "⚠ yes"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n",
|
serverDelta := "-"
|
||||||
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
|
if step.ServerDeltaW > 0 {
|
||||||
|
serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
|
||||||
|
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
b.WriteString("## Per-Slot Results\n\n")
|
b.WriteString("## Per-Slot Results\n\n")
|
||||||
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
|
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
|
||||||
b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
|
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
stableLimit := "-"
|
stableLimit := "-"
|
||||||
if gpu.StablePowerLimitW > 0 {
|
if gpu.StablePowerLimitW > 0 {
|
||||||
@@ -3243,8 +3268,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
|
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n",
|
serverDelta := "-"
|
||||||
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
|
if gpu.ServerDeltaW > 0 {
|
||||||
|
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
|
||||||
|
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
@@ -3273,11 +3302,19 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
|
|||||||
fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
|
fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
|
||||||
fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
|
fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
|
||||||
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
|
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
|
||||||
|
if step.ServerLoadedW > 0 {
|
||||||
|
fmt.Fprintf(&b, "ramp_step_%d_server_loaded_w=%.0f\n", step.StepIndex, step.ServerLoadedW)
|
||||||
|
fmt.Fprintf(&b, "ramp_step_%d_server_delta_w=%.0f\n", step.StepIndex, step.ServerDeltaW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
if gpu.StablePowerLimitW > 0 {
|
if gpu.StablePowerLimitW > 0 {
|
||||||
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
|
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
|
||||||
}
|
}
|
||||||
|
if gpu.ServerLoadedW > 0 {
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_server_loaded_w=%.0f\n", gpu.Index, gpu.ServerLoadedW)
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_server_delta_w=%.0f\n", gpu.Index, gpu.ServerDeltaW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if sp := result.ServerPower; sp != nil && sp.Available {
|
if sp := result.ServerPower; sp != nil && sp.Available {
|
||||||
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
|
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
|
||||||
@@ -3316,6 +3353,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
if infoErr != nil {
|
if infoErr != nil {
|
||||||
return "", infoErr
|
return "", infoErr
|
||||||
}
|
}
|
||||||
|
// Capture full nvidia-smi -q snapshot at the start of the run.
|
||||||
|
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||||
|
}
|
||||||
hostname, _ := os.Hostname()
|
hostname, _ := os.Hostname()
|
||||||
result := NvidiaPowerBenchResult{
|
result := NvidiaPowerBenchResult{
|
||||||
BenchmarkVersion: benchmarkVersion,
|
BenchmarkVersion: benchmarkVersion,
|
||||||
@@ -3341,13 +3382,31 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||||
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||||||
|
singleIPMILoadedW := make(map[int]float64, len(selected))
|
||||||
var allRestoreActions []benchmarkRestoreAction
|
var allRestoreActions []benchmarkRestoreAction
|
||||||
|
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
|
||||||
|
var allPowerRows []GPUMetricRow
|
||||||
|
var powerCursor float64
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
|
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
|
||||||
|
ipmiSingleDone := make(chan float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ipmiSingleDone)
|
||||||
|
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
|
||||||
|
ipmiSingleDone <- w
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
|
||||||
|
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||||
|
ipmiSingleCancel()
|
||||||
|
if w, ok := <-ipmiSingleDone; ok {
|
||||||
|
singleIPMILoadedW[idx] = w
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w))
|
||||||
|
}
|
||||||
allRestoreActions = append(allRestoreActions, restore...)
|
allRestoreActions = append(allRestoreActions, restore...)
|
||||||
if r, ok := c[idx]; ok {
|
if r, ok := c[idx]; ok {
|
||||||
calibByIndex[idx] = r
|
calibByIndex[idx] = r
|
||||||
@@ -3372,7 +3431,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
gpus = append(gpus, NvidiaPowerBenchGPU{
|
gpu := NvidiaPowerBenchGPU{
|
||||||
Index: idx,
|
Index: idx,
|
||||||
Name: info.Name,
|
Name: info.Name,
|
||||||
BusID: info.BusID,
|
BusID: info.BusID,
|
||||||
@@ -3385,7 +3444,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
Status: status,
|
Status: status,
|
||||||
Notes: append([]string(nil), calib.Notes...),
|
Notes: append([]string(nil), calib.Notes...),
|
||||||
CoolingWarning: calib.CoolingWarning,
|
CoolingWarning: calib.CoolingWarning,
|
||||||
})
|
}
|
||||||
|
if w, ok := singleIPMILoadedW[idx]; ok && serverIdleOK && w > 0 {
|
||||||
|
gpu.ServerLoadedW = w
|
||||||
|
gpu.ServerDeltaW = w - serverIdleW
|
||||||
|
}
|
||||||
|
if len(calib.MetricRows) > 0 {
|
||||||
|
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
||||||
|
gpu.Telemetry = &t
|
||||||
|
}
|
||||||
|
gpus = append(gpus, gpu)
|
||||||
}
|
}
|
||||||
sort.Slice(gpus, func(i, j int) bool {
|
sort.Slice(gpus, func(i, j int) bool {
|
||||||
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
|
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
|
||||||
@@ -3434,20 +3502,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
|
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
|
||||||
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
|
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
|
||||||
|
|
||||||
// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
|
// serverLoadedW tracks the IPMI server power from the final ramp step
|
||||||
// server-side loaded power while GPUs are under stress. The goroutine is
|
// (all GPUs simultaneously loaded). Earlier steps' values are stored
|
||||||
// cancelled as soon as Phase 2 finishes, and the average is used to compare
|
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
||||||
// against PlatformMaxTDPW (GPU-reported stable limits sum).
|
|
||||||
var serverLoadedW float64
|
var serverLoadedW float64
|
||||||
var serverLoadedOK bool
|
var serverLoadedOK bool
|
||||||
ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
|
|
||||||
ipmiPhase2Done := make(chan float64, 1)
|
|
||||||
go func() {
|
|
||||||
defer close(ipmiPhase2Done)
|
|
||||||
if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
|
|
||||||
ipmiPhase2Done <- w
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
// Step 1: reuse single-card calibration result directly.
|
// Step 1: reuse single-card calibration result directly.
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
@@ -3464,6 +3523,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
Derated: firstCalib.Derated,
|
Derated: firstCalib.Derated,
|
||||||
Status: "OK",
|
Status: "OK",
|
||||||
}
|
}
|
||||||
|
if w, ok := singleIPMILoadedW[firstIdx]; ok && serverIdleOK && w > 0 {
|
||||||
|
ramp.ServerLoadedW = w
|
||||||
|
ramp.ServerDeltaW = w - serverIdleW
|
||||||
|
}
|
||||||
if !firstCalib.Completed {
|
if !firstCalib.Completed {
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
||||||
@@ -3491,17 +3554,45 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
_ = os.MkdirAll(stepDir, 0755)
|
_ = os.MkdirAll(stepDir, 0755)
|
||||||
|
|
||||||
// Reuse the latest stable limits as starting points, but re-check every
|
// Reuse the latest stable limits as starting points, but re-check every
|
||||||
// active GPU in this hotter configuration.
|
// active GPU in this hotter configuration. For the newly introduced GPU,
|
||||||
seedForStep := make(map[int]int, len(stableLimits))
|
// seed from its single-card calibration so we do not restart from the
|
||||||
for k, v := range stableLimits {
|
// default TDP when a prior derated limit is already known.
|
||||||
seedForStep[k] = v
|
seedForStep := make(map[int]int, len(subset))
|
||||||
|
for _, idx := range subset {
|
||||||
|
if lim, ok := stableLimits[idx]; ok && lim > 0 {
|
||||||
|
seedForStep[idx] = lim
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if base, ok := calibByIndex[idx]; ok {
|
||||||
|
lim := int(math.Round(base.AppliedPowerLimitW))
|
||||||
|
if lim > 0 {
|
||||||
|
seedForStep[idx] = lim
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
|
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
|
||||||
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
||||||
|
|
||||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
|
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
|
||||||
|
ipmiStepDone := make(chan float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ipmiStepDone)
|
||||||
|
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
|
||||||
|
ipmiStepDone <- w
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
|
||||||
|
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||||
|
ipmiStepCancel()
|
||||||
|
var stepIPMILoadedW float64
|
||||||
|
var stepIPMIOK bool
|
||||||
|
if w, ok := <-ipmiStepDone; ok {
|
||||||
|
stepIPMILoadedW = w
|
||||||
|
stepIPMIOK = true
|
||||||
|
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
|
||||||
|
}
|
||||||
// Accumulate restore actions; they all run in the outer defer.
|
// Accumulate restore actions; they all run in the outer defer.
|
||||||
allRestoreActions = append(allRestoreActions, stepRestore...)
|
allRestoreActions = append(allRestoreActions, stepRestore...)
|
||||||
|
|
||||||
@@ -3564,15 +3655,17 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||||
}
|
}
|
||||||
|
|
||||||
result.RampSteps = append(result.RampSteps, ramp)
|
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
||||||
}
|
ramp.ServerLoadedW = stepIPMILoadedW
|
||||||
|
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
|
||||||
|
// The last step has all GPUs loaded — use it as the top-level loaded_w.
|
||||||
|
if step == len(result.RecommendedSlotOrder) {
|
||||||
|
serverLoadedW = stepIPMILoadedW
|
||||||
|
serverLoadedOK = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Stop IPMI Phase 2 sampling and collect result.
|
result.RampSteps = append(result.RampSteps, ramp)
|
||||||
ipmiPhase2Cancel()
|
|
||||||
if w, ok := <-ipmiPhase2Done; ok {
|
|
||||||
serverLoadedW = w
|
|
||||||
serverLoadedOK = true
|
|
||||||
logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
|
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
|
||||||
@@ -3602,6 +3695,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
||||||
_ = serverIdleOK // used implicitly via characterizeServerPower
|
_ = serverIdleOK // used implicitly via characterizeServerPower
|
||||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
||||||
|
// Write top-level gpu-metrics.csv/.html aggregating all phases.
|
||||||
|
writeBenchmarkMetricsFiles(runDir, allPowerRows)
|
||||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("marshal power result: %w", err)
|
return "", fmt.Errorf("marshal power result: %w", err)
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
|
|||||||
Available bool `json:"available"`
|
Available bool `json:"available"`
|
||||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||||
|
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
@@ -42,6 +43,31 @@ const (
|
|||||||
NvidiaBenchmarkProfileOvernight = "overnight"
|
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
|
||||||
|
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
|
||||||
|
// re-measure from actual task logs and update the constants here.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
|
||||||
|
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
||||||
|
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
||||||
|
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
||||||
|
// - BenchmarkEstimatedPowerStabilitySec: xFusion v8.17/v8.22 ramp 1-8: 1977-2002 s
|
||||||
|
const (
|
||||||
|
// Performance Benchmark (bee-gpu-burn).
|
||||||
|
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
||||||
|
// Sequential per-GPU mode scales approximately linearly.
|
||||||
|
BenchmarkEstimatedPerfStandardSec = 960 // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
|
||||||
|
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
|
||||||
|
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
|
||||||
|
|
||||||
|
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
||||||
|
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||||
|
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||||
|
BenchmarkEstimatedPowerStabilitySec = 2000 // ~33 min; stability profile converges faster (longer steady → faster convergence)
|
||||||
|
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
||||||
|
)
|
||||||
|
|
||||||
type NvidiaBenchmarkOptions struct {
|
type NvidiaBenchmarkOptions struct {
|
||||||
Profile string
|
Profile string
|
||||||
SizeMB int
|
SizeMB int
|
||||||
@@ -55,32 +81,32 @@ type NvidiaBenchmarkOptions struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaBenchmarkResult struct {
|
type NvidiaBenchmarkResult struct {
|
||||||
BenchmarkVersion string `json:"benchmark_version"`
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
GeneratedAt time.Time `json:"generated_at"`
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
Hostname string `json:"hostname,omitempty"`
|
Hostname string `json:"hostname,omitempty"`
|
||||||
ServerModel string `json:"server_model,omitempty"`
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile"`
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
RampStep int `json:"ramp_step,omitempty"`
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
RampTotal int `json:"ramp_total,omitempty"`
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||||
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||||
// 100% = each added GPU contributes exactly its single-card throughput.
|
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||||
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||||
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||||
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
Warnings []string `json:"warnings,omitempty"`
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
Normalization BenchmarkNormalization `json:"normalization"`
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkNormalization struct {
|
type BenchmarkNormalization struct {
|
||||||
@@ -223,8 +249,8 @@ type BenchmarkScorecard struct {
|
|||||||
|
|
||||||
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||||
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||||
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||||
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||||
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||||
|
|
||||||
// Temperature headroom: distance to the 100°C destruction threshold.
|
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||||
@@ -300,22 +326,22 @@ type NvidiaPowerBenchResult struct {
|
|||||||
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||||
// cumulative thermal ramp. Represents the actual sustained power budget of
|
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||||
// this server under full GPU load. Use for rack power planning.
|
// this server under full GPU load. Use for rack power planning.
|
||||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||||
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||||
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||||
// actual wall-power draw as seen by the server's power supply.
|
// actual wall-power draw as seen by the server's power supply.
|
||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchGPU struct {
|
type NvidiaPowerBenchGPU struct {
|
||||||
Index int `json:"index"`
|
Index int `json:"index"`
|
||||||
Name string `json:"name,omitempty"`
|
Name string `json:"name,omitempty"`
|
||||||
BusID string `json:"bus_id,omitempty"`
|
BusID string `json:"bus_id,omitempty"`
|
||||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||||
// StablePowerLimitW is the final fixed limit for this GPU after the
|
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||||
// cumulative thermal ramp. This is the limit at which the GPU operated
|
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||||
// stably with all other GPUs running simultaneously at their own limits.
|
// stably with all other GPUs running simultaneously at their own limits.
|
||||||
@@ -330,13 +356,20 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// Telemetry holds the aggregated stats from the final converged calibration
|
||||||
|
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||||
|
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchStep struct {
|
type NvidiaPowerBenchStep struct {
|
||||||
StepIndex int `json:"step_index"`
|
StepIndex int `json:"step_index"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||||
NewGPUIndex int `json:"new_gpu_index"`
|
NewGPUIndex int `json:"new_gpu_index"`
|
||||||
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||||
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||||
@@ -344,20 +377,24 @@ type NvidiaPowerBenchStep struct {
|
|||||||
Derated bool `json:"derated,omitempty"`
|
Derated bool `json:"derated,omitempty"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||||
// scalability ramp-up phase of the performance benchmark.
|
// scalability ramp-up phase of the performance benchmark.
|
||||||
type NvidiaPerformanceRampStep struct {
|
type NvidiaPerformanceRampStep struct {
|
||||||
StepIndex int `json:"step_index"`
|
StepIndex int `json:"step_index"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||||
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||||
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||||
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||||
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||||
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||||
ScalabilityPct float64 `json:"scalability_pct"`
|
ScalabilityPct float64 `json:"scalability_pct"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
|
|||||||
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||||
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||||
|
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
|||||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
|
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||||
for _, r := range rows {
|
for _, r := range rows {
|
||||||
dutyAvail := 0
|
dutyAvail := 0
|
||||||
if r.FanDutyCycleAvailable {
|
if r.FanDutyCycleAvailable {
|
||||||
dutyAvail = 1
|
dutyAvail = 1
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
|
dutyEstimated := 0
|
||||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
|
if r.FanDutyCycleEstimated {
|
||||||
|
dutyEstimated = 1
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||||
|
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||||
}
|
}
|
||||||
return os.WriteFile(path, b.Bytes(), 0644)
|
return os.WriteFile(path, b.Bytes(), 0644)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -140,26 +140,56 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
|||||||
}
|
}
|
||||||
|
|
||||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||||
if err != nil || len(squashfsFiles) == 0 {
|
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
|
||||||
}
|
|
||||||
|
|
||||||
free := freeMemBytes()
|
|
||||||
var needed int64
|
|
||||||
for _, sf := range squashfsFiles {
|
|
||||||
fi, err2 := os.Stat(sf)
|
|
||||||
if err2 != nil {
|
|
||||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
|
||||||
}
|
|
||||||
needed += fi.Size()
|
|
||||||
}
|
|
||||||
const headroom = 256 * 1024 * 1024
|
|
||||||
if free > 0 && needed+headroom > free {
|
|
||||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
|
||||||
humanBytes(needed+headroom), humanBytes(free))
|
|
||||||
}
|
|
||||||
|
|
||||||
dstDir := installToRAMDir
|
dstDir := installToRAMDir
|
||||||
|
|
||||||
|
// If the source medium is unavailable, check whether a previous run already
|
||||||
|
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||||
|
// directly to the loop-rebind / bind-mount steps.
|
||||||
|
if !sourceAvailable {
|
||||||
|
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||||
|
if len(copiedFiles) > 0 {
|
||||||
|
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||||
|
// Proceed to rebind with the already-copied files.
|
||||||
|
for _, dst := range copiedFiles {
|
||||||
|
base := filepath.Base(dst)
|
||||||
|
// Re-associate the loop device that was originally backed by the
|
||||||
|
// source file (now gone); find it by the old source path pattern.
|
||||||
|
srcGuess := "/run/live/medium/live/" + base
|
||||||
|
loopDev, lerr := findLoopForFile(srcGuess)
|
||||||
|
if lerr != nil {
|
||||||
|
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||||
|
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||||
|
} else {
|
||||||
|
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto bindMedium
|
||||||
|
}
|
||||||
|
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
free := freeMemBytes()
|
||||||
|
var needed int64
|
||||||
|
for _, sf := range squashfsFiles {
|
||||||
|
fi, err2 := os.Stat(sf)
|
||||||
|
if err2 != nil {
|
||||||
|
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||||
|
}
|
||||||
|
needed += fi.Size()
|
||||||
|
}
|
||||||
|
const headroom = 256 * 1024 * 1024
|
||||||
|
if free > 0 && needed+headroom > free {
|
||||||
|
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||||
|
humanBytes(needed+headroom), humanBytes(free))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if state.CopyPresent {
|
if state.CopyPresent {
|
||||||
log("Removing stale partial RAM copy before retry...")
|
log("Removing stale partial RAM copy before retry...")
|
||||||
}
|
}
|
||||||
@@ -199,6 +229,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bindMedium:
|
||||||
log("Copying remaining medium files...")
|
log("Copying remaining medium files...")
|
||||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||||
|
|||||||
@@ -18,11 +18,19 @@ type LiveMetricSample struct {
|
|||||||
Fans []FanReading `json:"fans"`
|
Fans []FanReading `json:"fans"`
|
||||||
Temps []TempReading `json:"temps"`
|
Temps []TempReading `json:"temps"`
|
||||||
PowerW float64 `json:"power_w"`
|
PowerW float64 `json:"power_w"`
|
||||||
|
PSUs []PSUReading `json:"psus,omitempty"`
|
||||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||||
MemLoadPct float64 `json:"mem_load_pct"`
|
MemLoadPct float64 `json:"mem_load_pct"`
|
||||||
GPUs []GPUMetricRow `json:"gpus"`
|
GPUs []GPUMetricRow `json:"gpus"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PSUReading is a per-slot power supply input power reading.
// Values of this type are built by samplePSUPower from IPMI SDR sensor lines
// and exposed through LiveMetricSample.PSUs.
|
||||||
|
type PSUReading struct {
|
||||||
|
Slot int `json:"slot"` // N taken from the sensor's IPMI entity ID "10.N"
|
||||||
|
Name string `json:"name"` // sensor name column of the SDR line the reading came from
|
||||||
|
PowerW float64 `json:"power_w"` // reported value in Watts; samplePSUPower only keeps values > 0
|
||||||
|
}
|
||||||
|
|
||||||
// TempReading is a named temperature sensor value.
|
// TempReading is a named temperature sensor value.
|
||||||
type TempReading struct {
|
type TempReading struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
@@ -57,6 +65,9 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
// System power — returns 0 if unavailable
|
// System power — returns 0 if unavailable
|
||||||
s.PowerW = sampleSystemPower()
|
s.PowerW = sampleSystemPower()
|
||||||
|
|
||||||
|
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||||
|
s.PSUs = samplePSUPower()
|
||||||
|
|
||||||
// CPU load — from /proc/stat
|
// CPU load — from /proc/stat
|
||||||
s.CPULoadPct = sampleCPULoadPct()
|
s.CPULoadPct = sampleCPULoadPct()
|
||||||
|
|
||||||
@@ -326,3 +337,65 @@ func compactAmbientTempName(chip, name string) string {
|
|||||||
}
|
}
|
||||||
return chip + " / " + name
|
return chip + " / " + name
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||||
|
// It parses `ipmitool sdr elist full` output looking for Power Supply entity
|
||||||
|
// sensors (entity ID "10.N") that report a value in Watts.
|
||||||
|
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||||
|
func samplePSUPower() []PSUReading {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// map slot → reading (keep highest-watt value per slot in case of duplicates)
|
||||||
|
type entry struct {
|
||||||
|
name string
|
||||||
|
powerW float64
|
||||||
|
}
|
||||||
|
bySlot := map[int]entry{}
|
||||||
|
for _, line := range strings.Split(string(out), "\n") {
|
||||||
|
parts := strings.Split(line, "|")
|
||||||
|
if len(parts) < 5 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
|
||||||
|
if !strings.HasPrefix(entityID, "10.") {
|
||||||
|
continue // not a Power Supply entity
|
||||||
|
}
|
||||||
|
slotStr := strings.TrimPrefix(entityID, "10.")
|
||||||
|
slot, err := strconv.Atoi(slotStr)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
|
||||||
|
if !strings.Contains(strings.ToLower(valueField), "watts") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
valueFields := strings.Fields(valueField)
|
||||||
|
if len(valueFields) < 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w, err := strconv.ParseFloat(valueFields[0], 64)
|
||||||
|
if err != nil || w <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sensorName := strings.TrimSpace(parts[0])
|
||||||
|
if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
|
||||||
|
bySlot[slot] = entry{name: sensorName, powerW: w}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(bySlot) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(bySlot))
|
||||||
|
for s := range bySlot {
|
||||||
|
slots = append(slots, s)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
psus := make([]PSUReading, 0, len(slots))
|
||||||
|
for _, s := range slots {
|
||||||
|
e := bySlot[s]
|
||||||
|
psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
|
||||||
|
}
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|||||||
@@ -20,6 +20,54 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Estimated wall-clock durations for each SAT/validate test, derived from real
|
||||||
|
// production logs in _benchmark/_v8/.
|
||||||
|
//
|
||||||
|
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
|
||||||
|
// the corresponding Run*Pack function change, re-measure the wall-clock duration
|
||||||
|
// from actual task logs and update the matching constant here.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||||
|
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||||
|
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
||||||
|
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
||||||
|
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
||||||
|
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
||||||
|
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||||
|
const (
|
||||||
|
// CPU validate: stress-ng 60 s + lscpu/sensors overhead.
|
||||||
|
SATEstimatedCPUValidateSec = 65
|
||||||
|
// CPU stress: stress-ng 1800 s (stress mode default).
|
||||||
|
SATEstimatedCPUStressSec = 1800
|
||||||
|
|
||||||
|
// RAM: memtester 256 MB / 1 pass.
|
||||||
|
SATEstimatedMemoryValidateSec = 70
|
||||||
|
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||||
|
SATEstimatedMemoryStressSec = 140
|
||||||
|
|
||||||
|
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
|
||||||
|
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
|
||||||
|
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
|
||||||
|
SATEstimatedNvidiaGPUStressPerGPUSec = 450
|
||||||
|
|
||||||
|
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
|
||||||
|
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
|
||||||
|
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
|
||||||
|
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
|
||||||
|
|
||||||
|
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||||
|
SATEstimatedNvidiaPulseTestSec = 5000
|
||||||
|
|
||||||
|
// NCCL all_reduce_perf, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaInterconnectSec = 300
|
||||||
|
// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
|
||||||
|
// without a user-configurable time limit; duration is determined by nvbandwidth itself.
|
||||||
|
SATEstimatedNvidiaBandwidthSec = 2700
|
||||||
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
satExecCommand = exec.Command
|
satExecCommand = exec.Command
|
||||||
satLookPath = exec.LookPath
|
satLookPath = exec.LookPath
|
||||||
@@ -366,12 +414,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
|||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
// detect GPU count
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
if err != nil {
|
||||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
return "", err
|
||||||
|
}
|
||||||
|
gpuCount := len(selected)
|
||||||
if gpuCount < 1 {
|
if gpuCount < 1 {
|
||||||
gpuCount = 1
|
gpuCount = 1
|
||||||
}
|
}
|
||||||
@@ -380,7 +430,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
|||||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
}},
|
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||||
), logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
|
|||||||
UpdatedAt time.Time
|
UpdatedAt time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// fanObservationState is the JSON document stored at fanObservationStatePath;
// loadFanObservationLocked decodes that file into a value of this type.
type fanObservationState struct {
|
||||||
|
// MaxRPM maps a name to an RPM value.
// NOTE(review): presumably fan sensor name → highest RPM observed so far — confirm in updateFanObservation.
MaxRPM map[string]float64 `json:"max_rpm"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// fanPeakCandidate pairs an RPM value with the time it was first seen.
// Instances are kept in the package-level fanPeakCandidates map, keyed by string.
// NOTE(review): presumably a pending new-maximum reading that must persist for
// fanObservationMinPeakHold before it is accepted — confirm in updateFanObservation.
type fanPeakCandidate struct {
|
||||||
|
FirstSeen time.Time
|
||||||
|
RPM float64
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
systemPowerCacheMu sync.Mutex
|
systemPowerCacheMu sync.Mutex
|
||||||
systemPowerCache cachedPowerReading
|
systemPowerCache cachedPowerReading
|
||||||
|
fanObservationMu sync.Mutex
|
||||||
|
fanObservation fanObservationState
|
||||||
|
fanObservationInit bool
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
)
|
)
|
||||||
|
|
||||||
const systemPowerHoldTTL = 15 * time.Second
|
const systemPowerHoldTTL = 15 * time.Second
|
||||||
|
|
||||||
|
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
|
||||||
|
|
||||||
|
const fanObservationMinPeakHold = time.Second
|
||||||
|
|
||||||
|
// normalizeObservedFanMaxRPM rounds an observed fan speed up to the next
// multiple of 1000 RPM (4200 → 5000, 5000 → 5000).
//
// Non-positive, NaN and infinite inputs yield 0. NaN and ±Inf are rejected
// explicitly because they would otherwise pass straight through math.Ceil,
// and encoding/json cannot marshal such values into the JSON-tagged
// fanObservationState.MaxRPM map.
func normalizeObservedFanMaxRPM(rpm float64) float64 {
	if math.IsNaN(rpm) || math.IsInf(rpm, 0) || rpm <= 0 {
		return 0
	}
	return math.Ceil(rpm/1000.0) * 1000.0
}
|
||||||
|
|
||||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||||
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
|||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||||
|
updateFanObservation(fans, time.Now())
|
||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||||
if len(fans) > 0 {
|
if len(fans) > 0 {
|
||||||
|
updateFanObservation(fans, time.Now())
|
||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
|||||||
return nil, sensorsErr
|
return nil, sensorsErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func loadFanObservationLocked() {
|
||||||
|
if fanObservationInit {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fanObservationInit = true
|
||||||
|
fanObservation.MaxRPM = make(map[string]float64)
|
||||||
|
raw, err := os.ReadFile(fanObservationStatePath)
|
||||||
|
if err != nil || len(raw) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var persisted fanObservationState
|
||||||
|
if json.Unmarshal(raw, &persisted) != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for name, rpm := range persisted.MaxRPM {
|
||||||
|
name = strings.TrimSpace(name)
|
||||||
|
if name == "" || rpm <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fanObservation.MaxRPM[name] = rpm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func saveFanObservationLocked() {
|
||||||
|
if len(fanObservation.MaxRPM) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
dir := filepath.Dir(fanObservationStatePath)
|
||||||
|
if dir == "" || dir == "." {
|
||||||
|
dir = "/var/log/bee-sat"
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func updateFanObservation(fans []FanReading, now time.Time) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fanObservationMu.Lock()
|
||||||
|
defer fanObservationMu.Unlock()
|
||||||
|
loadFanObservationLocked()
|
||||||
|
changed := false
|
||||||
|
for _, fan := range fans {
|
||||||
|
name := strings.TrimSpace(fan.Name)
|
||||||
|
if name == "" || fan.RPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
currentMax := fanObservation.MaxRPM[name]
|
||||||
|
if fan.RPM <= currentMax {
|
||||||
|
delete(fanPeakCandidates, name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if cand, ok := fanPeakCandidates[name]; ok {
|
||||||
|
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
|
||||||
|
newMax := math.Max(cand.RPM, fan.RPM)
|
||||||
|
if newMax > currentMax {
|
||||||
|
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
|
||||||
|
changed = true
|
||||||
|
}
|
||||||
|
delete(fanPeakCandidates, name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if fan.RPM > cand.RPM {
|
||||||
|
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
|
||||||
|
}
|
||||||
|
if changed {
|
||||||
|
saveFanObservationLocked()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
fanObservationMu.Lock()
|
||||||
|
defer fanObservationMu.Unlock()
|
||||||
|
loadFanObservationLocked()
|
||||||
|
var samples []float64
|
||||||
|
for _, fan := range fans {
|
||||||
|
name := strings.TrimSpace(fan.Name)
|
||||||
|
if name == "" || fan.RPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
maxRPM := fanObservation.MaxRPM[name]
|
||||||
|
if maxRPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
pct := fan.RPM / maxRPM * 100.0
|
||||||
|
if pct > 100 {
|
||||||
|
pct = 100
|
||||||
|
}
|
||||||
|
if pct < 0 {
|
||||||
|
pct = 0
|
||||||
|
}
|
||||||
|
samples = append(samples, pct)
|
||||||
|
}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
// Handles two formats:
|
// Handles two formats:
|
||||||
//
|
//
|
||||||
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
|||||||
|
|
||||||
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||||
// Returns the average duty cycle across all exposed PWM controls.
|
// Returns the average duty cycle across all exposed PWM controls.
|
||||||
func sampleFanDutyCyclePct() (float64, bool) {
|
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||||
out, err := exec.Command("sensors", "-j").Output()
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
if err != nil || len(out) == 0 {
|
if err != nil || len(out) == 0 {
|
||||||
return 0, false
|
fans, fanErr := sampleFanSpeeds()
|
||||||
|
if fanErr != nil {
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
return sampleFanDutyCyclePctFromFans(fans)
|
||||||
}
|
}
|
||||||
return parseFanDutyCyclePctSensorsJSON(out)
|
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||||
|
return pct, ok, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||||
|
return pct, true, true
|
||||||
|
}
|
||||||
|
return 0, false, false
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldPath := fanObservationStatePath
|
||||||
|
oldState := fanObservation
|
||||||
|
oldInit := fanObservationInit
|
||||||
|
oldCandidates := fanPeakCandidates
|
||||||
|
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||||
|
fanObservation = fanObservationState{}
|
||||||
|
fanObservationInit = false
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
|
t.Cleanup(func() {
|
||||||
|
fanObservationStatePath = oldPath
|
||||||
|
fanObservation = oldState
|
||||||
|
fanObservationInit = oldInit
|
||||||
|
fanPeakCandidates = oldCandidates
|
||||||
|
})
|
||||||
|
|
||||||
|
start := time.Unix(100, 0)
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||||
|
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||||
|
t.Fatalf("single-sample spike should not establish observed max")
|
||||||
|
}
|
||||||
|
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||||
|
|
||||||
|
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||||
|
}
|
||||||
|
if got < 43 || got > 44 {
|
||||||
|
t.Fatalf("got=%v want ~43.3", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
fanObservation = fanObservationState{}
|
||||||
|
fanObservationInit = false
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
|
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||||
|
}
|
||||||
|
if got < 43 || got > 44 {
|
||||||
|
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseDCMIPowerReading(t *testing.T) {
|
func TestParseDCMIPowerReading(t *testing.T) {
|
||||||
raw := `
|
raw := `
|
||||||
Instantaneous power reading: 512 Watts
|
Instantaneous power reading: 512 Watts
|
||||||
|
|||||||
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||||
|
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||||
|
if len(cmd) != len(want) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if cmd[i] != want[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||||
if len(env) != 2 {
|
if len(env) != 2 {
|
||||||
|
|||||||
@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||||
|
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||||
|
// power charts where the filled area of each PSU shows its individual
|
||||||
|
// contribution and the total height equals the combined draw.
|
||||||
|
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range datasets {
|
||||||
|
if len(datasets[i]) == 0 {
|
||||||
|
datasets[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||||
|
pointCount = len(times)
|
||||||
|
|
||||||
|
// Build cumulative sums per time point.
|
||||||
|
cumulative := make([][]float64, len(datasets)+1)
|
||||||
|
for i := range cumulative {
|
||||||
|
cumulative[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
for i, ds := range datasets {
|
||||||
|
for j, v := range ds {
|
||||||
|
cumulative[i+1][j] = cumulative[i][j] + v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scale is based on the total (top cumulative row).
|
||||||
|
total := cumulative[len(cumulative)-1]
|
||||||
|
yMin := floatPtr(0)
|
||||||
|
if yMax == nil {
|
||||||
|
yMax = autoMax120(total)
|
||||||
|
}
|
||||||
|
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||||
|
|
||||||
|
legendItems := make([]metricChartSeries, len(datasets))
|
||||||
|
for i, name := range names {
|
||||||
|
color := metricChartPalette[i%len(metricChartPalette)]
|
||||||
|
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stats label from totals.
|
||||||
|
statsLabel := chartStatsLabel([][]float64{total})
|
||||||
|
|
||||||
|
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||||
|
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scale)
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
writeSingleAxisY(&b, layout, scale)
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
|
||||||
|
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
// Draw border polylines on top.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
|
||||||
|
writeLegend(&b, layout, legendItems)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeStackedArea draws a filled polygon between two cumulative value arrays
|
||||||
|
// (baseline and top), using the given color at 55% opacity.
|
||||||
|
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
|
||||||
|
n := len(top)
|
||||||
|
if n == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(baseline) < n {
|
||||||
|
baseline = make([]float64, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward path along top values, then backward along baseline values.
|
||||||
|
var points strings.Builder
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
if i > 0 {
|
||||||
|
points.WriteByte(' ')
|
||||||
|
}
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
for i := n - 1; i >= 0; i-- {
|
||||||
|
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
points.WriteByte(' ')
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
|
||||||
|
}
|
||||||
|
|
||||||
// writeSVGOpen writes the opening <svg> tag, followed by a newline, using
// width and height for both the element size and the viewBox.
func writeSVGOpen(b *strings.Builder, width, height int) {
	openTag := fmt.Sprintf(`<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
	b.WriteString(openTag)
}
|
||||||
|
|||||||
@@ -1378,15 +1378,64 @@ setInterval(loadMetricsLayout, 5000);
|
|||||||
// ── Validate (Acceptance Tests) ───────────────────────────────────────────────
|
// ── Validate (Acceptance Tests) ───────────────────────────────────────────────
|
||||||
|
|
||||||
// validateInventory carries the per-category inventory text and GPU counts
// that renderValidate uses to build the Validate page.
type validateInventory struct {
	// CPU, Memory, Storage, NVIDIA and AMD are passed to the matching
	// validate card bodies. NOTE(review): presumably pre-rendered inventory
	// summaries — confirm against loadValidateInventory.
	CPU     string
	Memory  string
	Storage string
	NVIDIA  string
	AMD     string
	// NvidiaGPUCount is read by renderValidate to scale duration estimates.
	NvidiaGPUCount int
	// AMDGPUCount has no visible reader in this part of the file.
	AMDGPUCount int
}
|
||||||
|
|
||||||
|
// validateFmtDur formats a duration in seconds as a human-readable "~N min" or
// "~N s" string. Values below 120 are shown in seconds; larger values are
// shown in minutes computed as (secs+29)/60 with integer division.
func validateFmtDur(secs int) string {
	if secs >= 120 {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||||
|
|
||||||
|
// validateTotalValidateSec returns the estimated wall-clock duration of
|
||||||
|
// "Validate one by one" in Validate mode for n NVIDIA GPUs.
|
||||||
|
func validateTotalValidateSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
|
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateTotalStressSec returns the estimated wall-clock duration of
|
||||||
|
// "Validate one by one" in Stress mode for n NVIDIA GPUs.
|
||||||
|
func validateTotalStressSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
|
platform.SATEstimatedMemoryStressSec +
|
||||||
|
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
||||||
|
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
||||||
|
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
||||||
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
return total
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderValidate(opts HandlerOptions) string {
|
func renderValidate(opts HandlerOptions) string {
|
||||||
inv := loadValidateInventory(opts)
|
inv := loadValidateInventory(opts)
|
||||||
|
n := inv.NvidiaGPUCount
|
||||||
|
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||||
|
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||||
|
gpuNote := ""
|
||||||
|
if n > 0 {
|
||||||
|
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||||
|
}
|
||||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
@@ -1396,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
<div class="validate-profile-col">
|
<div class="validate-profile-col">
|
||||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="validate-profile-col validate-profile-action">
|
<div class="validate-profile-col validate-profile-action">
|
||||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
<div style="margin-top:12px">
|
<div style="margin-top:12px">
|
||||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
@@ -1413,19 +1462,19 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.CPU,
|
inv.CPU,
|
||||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
`60s in Validate, 30 min in Stress.`,
|
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
inv.Memory,
|
inv.Memory,
|
||||||
`Runs a RAM validation pass and records memory state around the test.`,
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
`<code>free</code>, <code>memtester</code>`,
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
|
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
inv.Storage,
|
inv.Storage,
|
||||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
`Short self-test in Validate, extended self-test in Stress.`,
|
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>
|
`</div>
|
||||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
@@ -1450,14 +1499,33 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
|
func() string {
|
||||||
|
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
||||||
|
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
||||||
|
if n > 0 {
|
||||||
|
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
||||||
|
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
||||||
|
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
||||||
|
validateFmtDur(perV), validateFmtDur(perS))
|
||||||
|
}(),
|
||||||
)) +
|
)) +
|
||||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
func() string {
|
||||||
|
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
||||||
|
s := "Skipped in Validate. "
|
||||||
|
if n > 0 {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||||
|
} else {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||||
|
}
|
||||||
|
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||||
|
}(),
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-targeted-power">` +
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
@@ -1465,7 +1533,16 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
`<code>dcgmi diag targeted_power</code>`,
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
func() string {
|
||||||
|
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
||||||
|
s := "Skipped in Validate. "
|
||||||
|
if n > 0 {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||||
|
} else {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||||
|
}
|
||||||
|
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||||
|
}(),
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-pulse">` +
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
@@ -1473,7 +1550,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||||
`<code>dcgmi diag pulse_test</code>`,
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-interconnect">` +
|
`<div id="sat-card-nvidia-interconnect">` +
|
||||||
@@ -1481,7 +1558,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-bandwidth">` +
|
`<div id="sat-card-nvidia-bandwidth">` +
|
||||||
@@ -1489,7 +1566,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
`<code>nvbandwidth</code>`,
|
`<code>nvbandwidth</code>`,
|
||||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`</div>
|
`</div>
|
||||||
@@ -1527,8 +1604,6 @@ function satModeChanged() {
|
|||||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||||
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
|
||||||
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
|
||||||
].forEach(function(item) {
|
].forEach(function(item) {
|
||||||
const card = document.getElementById(item.card);
|
const card = document.getElementById(item.card);
|
||||||
if (card) {
|
if (card) {
|
||||||
@@ -1776,7 +1851,7 @@ function runAllSAT() {
|
|||||||
const cycles = 1;
|
const cycles = 1;
|
||||||
const status = document.getElementById('sat-all-status');
|
const status = document.getElementById('sat-all-status');
|
||||||
status.textContent = 'Enqueuing...';
|
status.textContent = 'Enqueuing...';
|
||||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||||
const activeTargets = baseTargets.filter(target => {
|
const activeTargets = baseTargets.filter(target => {
|
||||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||||
@@ -1924,6 +1999,8 @@ func loadValidateInventory(opts HandlerOptions) validateInventory {
|
|||||||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||||
|
out.NvidiaGPUCount = nvidiaTotal
|
||||||
|
out.AMDGPUCount = amdTotal
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2016,9 +2093,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
|
|||||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
type benchmarkHistoryRun struct {
|
type benchmarkHistoryRun struct {
|
||||||
generatedAt time.Time
|
generatedAt time.Time
|
||||||
displayTime string
|
displayTime string
|
||||||
gpuScores map[int]float64 // GPU index → composite score
|
gpuScores map[int]float64 // GPU index → composite score
|
||||||
|
gpuStatuses map[int]string // GPU index → status ("OK", "WARNING", "FAILED", …)
|
||||||
|
overallStatus string
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderBenchmark(opts HandlerOptions) string {
|
func renderBenchmark(opts HandlerOptions) string {
|
||||||
@@ -2031,9 +2110,9 @@ func renderBenchmark(opts HandlerOptions) string {
|
|||||||
<div class="form-row">
|
<div class="form-row">
|
||||||
<label>Profile</label>
|
<label>Profile</label>
|
||||||
<select id="benchmark-profile">
|
<select id="benchmark-profile">
|
||||||
<option value="standard" selected>Standard — about 15 minutes</option>
|
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||||||
<option value="stability">Stability — 1 to 2 hours</option>
|
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||||||
<option value="overnight">Overnight — 8 hours</option>
|
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-row">
|
<div class="form-row">
|
||||||
@@ -2073,16 +2152,16 @@ func renderBenchmark(opts HandlerOptions) string {
|
|||||||
<div class="card-body">
|
<div class="card-body">
|
||||||
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||||||
<table>
|
<table>
|
||||||
<tr><th>Run Type</th><th>Engine</th><th>Question</th></tr>
|
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||||||
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td></tr>
|
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||||||
<tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td></tr>
|
<tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||||||
</table>
|
</table>
|
||||||
<p style="font-size:12px;color:var(--muted);margin-top:10px">Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
|
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||||
|
|
||||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||||
@@ -2326,7 +2405,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
|
|||||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||||
}
|
}
|
||||||
b.WriteString(`<div style="overflow-x:auto">`)
|
b.WriteString(`<div style="overflow-x:auto">`)
|
||||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
|
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||||||
for i := 0; i <= maxGPUIndex; i++ {
|
for i := 0; i <= maxGPUIndex; i++ {
|
||||||
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||||||
}
|
}
|
||||||
@@ -2335,13 +2414,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
|
|||||||
b.WriteString(`<tr>`)
|
b.WriteString(`<tr>`)
|
||||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
overallColor := "var(--ok)"
|
||||||
|
overallLabel := run.overallStatus
|
||||||
|
if overallLabel == "" {
|
||||||
|
overallLabel = "OK"
|
||||||
|
}
|
||||||
|
if overallLabel == "FAILED" {
|
||||||
|
overallColor = "var(--crit-fg,#9f3a38)"
|
||||||
|
} else if overallLabel != "OK" {
|
||||||
|
overallColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||||||
for idx := 0; idx <= maxGPUIndex; idx++ {
|
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||||||
score, ok := run.gpuScores[idx]
|
score, ok := run.gpuScores[idx]
|
||||||
if !ok {
|
if !ok {
|
||||||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
gpuStatus := run.gpuStatuses[idx]
|
||||||
|
scoreColor := ""
|
||||||
|
switch gpuStatus {
|
||||||
|
case "FAILED":
|
||||||
|
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||||||
|
case "WARNING", "PARTIAL":
|
||||||
|
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||||
|
case "", "OK":
|
||||||
|
// no override
|
||||||
|
default:
|
||||||
|
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||||
|
}
|
||||||
|
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||||
}
|
}
|
||||||
b.WriteString(`</tr>`)
|
b.WriteString(`</tr>`)
|
||||||
}
|
}
|
||||||
@@ -2375,12 +2477,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
run := benchmarkHistoryRun{
|
run := benchmarkHistoryRun{
|
||||||
generatedAt: result.GeneratedAt,
|
generatedAt: result.GeneratedAt,
|
||||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
gpuScores: make(map[int]float64),
|
gpuScores: make(map[int]float64),
|
||||||
|
gpuStatuses: make(map[int]string),
|
||||||
|
overallStatus: result.OverallStatus,
|
||||||
}
|
}
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||||
|
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||||
if gpu.Index > maxGPUIndex {
|
if gpu.Index > maxGPUIndex {
|
||||||
maxGPUIndex = gpu.Index
|
maxGPUIndex = gpu.Index
|
||||||
}
|
}
|
||||||
@@ -2449,31 +2554,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
|||||||
|
|
||||||
if len(latest.GPUs) > 0 {
|
if len(latest.GPUs) > 0 {
|
||||||
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||||
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
|
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||||
b.WriteString(`</tr></thead><tbody>`)
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
for _, gpu := range latest.GPUs {
|
for _, gpu := range latest.GPUs {
|
||||||
derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
|
// finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
|
||||||
|
// falling back to single-card applied limit if the ramp hasn't run.
|
||||||
|
finalLimitW := gpu.StablePowerLimitW
|
||||||
|
if finalLimitW <= 0 {
|
||||||
|
finalLimitW = gpu.AppliedPowerLimitW
|
||||||
|
}
|
||||||
|
// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
|
||||||
|
derated := gpu.Derated ||
|
||||||
|
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||||
rowStyle := ""
|
rowStyle := ""
|
||||||
achievedStyle := ""
|
finalStyle := ""
|
||||||
if derated {
|
if derated {
|
||||||
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||||
achievedStyle = ` style="color:#e6a000;font-weight:600"`
|
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||||
}
|
}
|
||||||
statusLabel := gpu.Status
|
statusLabel := gpu.Status
|
||||||
if statusLabel == "" {
|
if statusLabel == "" {
|
||||||
statusLabel = "OK"
|
statusLabel = "OK"
|
||||||
}
|
}
|
||||||
statusColor := "var(--ok)"
|
statusColor := "var(--ok)"
|
||||||
if statusLabel != "OK" {
|
if statusLabel == "FAILED" {
|
||||||
|
statusColor = "var(--crit-fg,#9f3a38)"
|
||||||
|
} else if statusLabel != "OK" {
|
||||||
statusColor = "var(--warn)"
|
statusColor = "var(--warn)"
|
||||||
}
|
}
|
||||||
nominalStr := "-"
|
nominalStr := "-"
|
||||||
if gpu.DefaultPowerLimitW > 0 {
|
if gpu.DefaultPowerLimitW > 0 {
|
||||||
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||||
}
|
}
|
||||||
achievedStr := "-"
|
singleStr := "-"
|
||||||
if gpu.AppliedPowerLimitW > 0 {
|
if gpu.AppliedPowerLimitW > 0 {
|
||||||
achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
multiStr := "-"
|
||||||
|
if gpu.StablePowerLimitW > 0 {
|
||||||
|
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||||
}
|
}
|
||||||
p95Str := "-"
|
p95Str := "-"
|
||||||
if gpu.MaxObservedPowerW > 0 {
|
if gpu.MaxObservedPowerW > 0 {
|
||||||
@@ -2483,7 +2602,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
|||||||
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||||
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||||
b.WriteString(`<td>` + nominalStr + `</td>`)
|
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||||
b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
|
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||||
|
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||||
b.WriteString(`<td>` + p95Str + `</td>`)
|
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||||
b.WriteString(`</tr>`)
|
b.WriteString(`</tr>`)
|
||||||
@@ -2517,7 +2637,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
|||||||
|
|
||||||
func renderBurn() string {
|
func renderBurn() string {
|
||||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
|
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
<div class="card" style="margin-bottom:16px">
|
<div class="card" style="margin-bottom:16px">
|
||||||
@@ -2525,13 +2645,13 @@ func renderBurn() string {
|
|||||||
<div class="card-body burn-profile-body">
|
<div class="card-body burn-profile-body">
|
||||||
<div class="burn-profile-col">
|
<div class="burn-profile-col">
|
||||||
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
|
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
|
||||||
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — quick check (~5 min)</span></label>
|
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
|
||||||
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 hour</span></label>
|
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
|
||||||
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
|
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="burn-profile-col burn-profile-action">
|
<div class="burn-profile-col burn-profile-action">
|
||||||
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||||
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
|
<p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
|
||||||
</div>
|
</div>
|
||||||
<div class="burn-profile-col burn-profile-action">
|
<div class="burn-profile-col burn-profile-action">
|
||||||
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||||
|
|||||||
@@ -575,12 +575,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
}
|
}
|
||||||
timeline := metricsTimelineSegments(samples, time.Now())
|
timeline := metricsTimelineSegments(samples, time.Now())
|
||||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||||
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
var overviewOk bool
|
||||||
|
var buf []byte
|
||||||
|
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if !ok {
|
if !overviewOk {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -589,23 +591,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
buf, err := renderMetricChartSVG(
|
var buf []byte
|
||||||
title,
|
if stacked {
|
||||||
labels,
|
buf, err = renderStackedMetricChartSVG(
|
||||||
sampleTimes(samples),
|
title,
|
||||||
datasets,
|
labels,
|
||||||
names,
|
sampleTimes(samples),
|
||||||
yMin,
|
datasets,
|
||||||
yMax,
|
names,
|
||||||
chartCanvasHeightForPath(path, len(names)),
|
yMax,
|
||||||
timeline,
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
)
|
timeline,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
buf, err = renderMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMin,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
@@ -615,12 +631,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
|
||||||
var datasets [][]float64
|
labels = sampleTimeLabels(samples)
|
||||||
var names []string
|
|
||||||
var title string
|
|
||||||
var yMin, yMax *float64
|
|
||||||
labels := sampleTimeLabels(samples)
|
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case path == "server-load":
|
case path == "server-load":
|
||||||
@@ -656,15 +668,41 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
|
|
||||||
case path == "server-power":
|
case path == "server-power":
|
||||||
title = "System Power"
|
title = "System Power"
|
||||||
power := make([]float64, len(samples))
|
// Use per-PSU stacked chart when PSU SDR data is available.
|
||||||
for i, s := range samples {
|
// Collect the union of PSU slots seen across all samples.
|
||||||
power[i] = s.PowerW
|
psuSlots := psuSlotsFromSamples(samples)
|
||||||
|
if len(psuSlots) > 1 {
|
||||||
|
// Build one dataset per PSU slot.
|
||||||
|
psuDatasets := make([][]float64, len(psuSlots))
|
||||||
|
psuNames := make([]string, len(psuSlots))
|
||||||
|
for si, slot := range psuSlots {
|
||||||
|
ds := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
for _, psu := range s.PSUs {
|
||||||
|
if psu.Slot == slot {
|
||||||
|
ds[i] = psu.PowerW
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
psuDatasets[si] = normalizePowerSeries(ds)
|
||||||
|
psuNames[si] = fmt.Sprintf("PSU %d", slot)
|
||||||
|
}
|
||||||
|
datasets = psuDatasets
|
||||||
|
names = psuNames
|
||||||
|
stacked = true
|
||||||
|
yMax = autoMax120(psuStackedTotal(psuDatasets))
|
||||||
|
} else {
|
||||||
|
power := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
power[i] = s.PowerW
|
||||||
|
}
|
||||||
|
power = normalizePowerSeries(power)
|
||||||
|
datasets = [][]float64{power}
|
||||||
|
names = []string{"Power W"}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(power)
|
||||||
}
|
}
|
||||||
power = normalizePowerSeries(power)
|
|
||||||
datasets = [][]float64{power}
|
|
||||||
names = []string{"Power W"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(power)
|
|
||||||
|
|
||||||
case path == "server-fans":
|
case path == "server-fans":
|
||||||
title = "Fan RPM"
|
title = "Fan RPM"
|
||||||
@@ -707,7 +745,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
case strings.HasPrefix(path, "gpu/"):
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
idx, sub, ok := parseGPUChartPath(path)
|
idx, sub, ok := parseGPUChartPath(path)
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
switch sub {
|
switch sub {
|
||||||
case "load":
|
case "load":
|
||||||
@@ -715,7 +753,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
if util == nil && mem == nil {
|
if util == nil && mem == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||||
names = []string{"Load %", "Mem %"}
|
names = []string{"Load %", "Mem %"}
|
||||||
@@ -725,7 +763,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Temperature"
|
title = gpuDisplayLabel(idx) + " Temperature"
|
||||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
if temp == nil {
|
if temp == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{temp}
|
datasets = [][]float64{temp}
|
||||||
names = []string{"Temp °C"}
|
names = []string{"Temp °C"}
|
||||||
@@ -735,7 +773,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Core Clock"
|
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{clock}
|
datasets = [][]float64{clock}
|
||||||
names = []string{"Core Clock MHz"}
|
names = []string{"Core Clock MHz"}
|
||||||
@@ -744,7 +782,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Memory Clock"
|
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{clock}
|
datasets = [][]float64{clock}
|
||||||
names = []string{"Memory Clock MHz"}
|
names = []string{"Memory Clock MHz"}
|
||||||
@@ -753,7 +791,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Power"
|
title = gpuDisplayLabel(idx) + " Power"
|
||||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
if power == nil {
|
if power == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
@@ -761,10 +799,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
}
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
|
|
||||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
||||||
@@ -930,6 +968,37 @@ func normalizePowerSeries(ds []float64) []float64 {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// psuSlotsFromSamples returns the sorted list of PSU slot numbers seen across samples.
|
||||||
|
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]struct{}{}
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, p := range s.PSUs {
|
||||||
|
seen[p.Slot] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(seen))
|
||||||
|
for s := range seen {
|
||||||
|
slots = append(slots, s)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
return slots
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuStackedTotal returns the point-by-point sum of all PSU datasets (for
// scale calculation).
//
// The result is as long as the longest dataset; a shorter dataset contributes
// nothing past its own end. It returns nil when datasets is empty.
func psuStackedTotal(datasets [][]float64) []float64 {
	if len(datasets) == 0 {
		return nil
	}

	// Size the accumulator by the longest series rather than by the first one,
	// so ragged input cannot index past the end of total.
	n := 0
	for _, ds := range datasets {
		if len(ds) > n {
			n = len(ds)
		}
	}

	total := make([]float64, n)
	for _, ds := range datasets {
		for i, v := range ds {
			total[i] += v
		}
	}
	return total
}
|
||||||
|
|
||||||
func normalizeFanSeries(ds []float64) []float64 {
|
func normalizeFanSeries(ds []float64) []float64 {
|
||||||
if len(ds) == 0 {
|
if len(ds) == 0 {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("gpu-all-clock returned ok=false")
|
t.Fatal("gpu-all-clock returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA Interconnect (NCCL)`,
|
||||||
|
`Validate and Stress:`,
|
||||||
|
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||||
|
`nvbandwidth runs all built-in tests without a time limit`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
|
|||||||
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
|
|||||||
}
|
}
|
||||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
return "", nil, false
|
return "", nil, false
|
||||||
}
|
}
|
||||||
buf, err := renderMetricChartSVG(
|
var buf []byte
|
||||||
title,
|
var err error
|
||||||
labels,
|
if stacked {
|
||||||
sampleTimes(samples),
|
buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||||
datasets,
|
} else {
|
||||||
names,
|
buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||||
yMin,
|
}
|
||||||
yMax,
|
|
||||||
chartCanvasHeightForPath(path, len(names)),
|
|
||||||
timeline,
|
|
||||||
)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", nil, false
|
return "", nil, false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -613,8 +613,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
}
|
}
|
||||||
a := q.opts.App
|
a := q.opts.App
|
||||||
|
|
||||||
|
recovered := len(j.lines) > 0
|
||||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||||
if len(j.lines) > 0 {
|
if recovered {
|
||||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -736,15 +737,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
dur := t.params.Duration
|
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
|
||||||
}
|
|
||||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
|
||||||
DurationSec: dur,
|
|
||||||
Loader: platform.NvidiaStressLoaderNCCL,
|
|
||||||
GPUIndices: t.params.GPUIndices,
|
|
||||||
}, j.append)
|
|
||||||
case "nvidia-stress":
|
case "nvidia-stress":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
|
|||||||
@@ -15,6 +15,41 @@ This applies to:
|
|||||||
- `iso/builder/config/package-lists/*.list.chroot`
|
- `iso/builder/config/package-lists/*.list.chroot`
|
||||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||||
|
|
||||||
|
## Bootloader sync rule
|
||||||
|
|
||||||
|
The ISO has two independent bootloader configs that must be kept in sync manually:
|
||||||
|
|
||||||
|
| File | Used by |
|
||||||
|
|------|---------|
|
||||||
|
| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
|
||||||
|
| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
|
||||||
|
|
||||||
|
live-build does NOT derive one from the other. Any new boot entry, kernel parameter
|
||||||
|
change, or new mode added to one file must be manually mirrored in the other.
|
||||||
|
|
||||||
|
**Canonical entry list** (both files must have all of these):
|
||||||
|
|
||||||
|
| Label | Key params |
|
||||||
|
|-------|-----------|
|
||||||
|
| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
|
||||||
|
| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0` (minimal set, no tuning params) |
|
||||||
|
|
||||||
|
**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
|
||||||
|
```
|
||||||
|
net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
|
||||||
|
numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
nowatchdog nosoftlockup
|
||||||
|
```
|
||||||
|
(fail-safe is the exception — it deliberately uses minimal params.)
|
||||||
|
|
||||||
|
**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
|
||||||
|
live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
|
||||||
|
configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
|
||||||
|
|
||||||
## Memtest rule
|
## Memtest rule
|
||||||
|
|
||||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||||
|
|||||||
@@ -16,6 +16,11 @@ menuentry "EASY-BEE" {
|
|||||||
}
|
}
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
submenu "EASY-BEE (advanced options) -->" {
|
||||||
|
menuentry "EASY-BEE — load to RAM (toram)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE — GSP=off" {
|
menuentry "EASY-BEE — GSP=off" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
@@ -26,6 +31,11 @@ submenu "EASY-BEE (advanced options) -->" {
|
|||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE — KMS + GSP=off" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE — fail-safe" {
|
menuentry "EASY-BEE — fail-safe" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
|
|||||||
menu default
|
menu default
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms
|
|
||||||
menu label EASY-BEE (^graphics/KMS)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE (^load to RAM)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-gsp-off
|
label live-@FLAVOUR@-gsp-off
|
||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms-gsp-off
|
label live-@FLAVOUR@-kms
|
||||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
menu label EASY-BEE (^KMS, no nomodeset)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (KMS, ^GSP=off)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
label live-@FLAVOUR@-failsafe
|
||||||
menu label EASY-BEE (^fail-safe)
|
menu label EASY-BEE (^fail-safe)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
label memtest
|
label memtest
|
||||||
menu label ^Memory Test (memtest86+)
|
menu label ^Memory Test (memtest86+)
|
||||||
|
|||||||
@@ -63,8 +63,10 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
@@ -1,117 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
|
||||||
set -e
|
|
||||||
echo "=== generating bee wallpaper ==="
|
|
||||||
mkdir -p /usr/share/bee
|
|
||||||
|
|
||||||
python3 - <<'PYEOF'
|
|
||||||
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
|
||||||
import os
|
|
||||||
|
|
||||||
W, H = 1920, 1080
|
|
||||||
|
|
||||||
ASCII_ART = [
|
|
||||||
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
|
||||||
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
|
||||||
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
|
||||||
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
|
||||||
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
|
||||||
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
|
||||||
]
|
|
||||||
SUBTITLE = " Hardware Audit LiveCD"
|
|
||||||
|
|
||||||
FG = (0xF6, 0xD0, 0x47)
|
|
||||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
|
||||||
SHADOW = (0x5E, 0x47, 0x05)
|
|
||||||
SUB = (0x96, 0x7A, 0x17)
|
|
||||||
BG = (0x05, 0x05, 0x05)
|
|
||||||
|
|
||||||
MONO_FONT_CANDIDATES = [
|
|
||||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
|
||||||
]
|
|
||||||
SUB_FONT_CANDIDATES = [
|
|
||||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def load_font(candidates, size):
|
|
||||||
for path in candidates:
|
|
||||||
if os.path.exists(path):
|
|
||||||
return ImageFont.truetype(path, size)
|
|
||||||
return ImageFont.load_default()
|
|
||||||
|
|
||||||
|
|
||||||
def mono_metrics(font):
|
|
||||||
probe = Image.new('L', (W, H), 0)
|
|
||||||
draw = ImageDraw.Draw(probe)
|
|
||||||
char_w = int(round(draw.textlength("M", font=font)))
|
|
||||||
bb = draw.textbbox((0, 0), "Mg", font=font)
|
|
||||||
char_h = bb[3] - bb[1]
|
|
||||||
return char_w, char_h
|
|
||||||
|
|
||||||
|
|
||||||
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
|
|
||||||
width = max(len(line) for line in lines) * char_w
|
|
||||||
height = len(lines) * char_h + line_gap * (len(lines) - 1)
|
|
||||||
mask = Image.new('L', (width, height), 0)
|
|
||||||
draw = ImageDraw.Draw(mask)
|
|
||||||
for row, line in enumerate(lines):
|
|
||||||
y = row * (char_h + line_gap)
|
|
||||||
for col, ch in enumerate(line):
|
|
||||||
if ch == ' ':
|
|
||||||
continue
|
|
||||||
x = col * char_w
|
|
||||||
draw.text((x, y), ch, font=font, fill=255)
|
|
||||||
return mask
|
|
||||||
|
|
||||||
|
|
||||||
img = Image.new('RGB', (W, H), BG)
|
|
||||||
draw = ImageDraw.Draw(img)
|
|
||||||
|
|
||||||
# Soft amber glow under the logo without depending on font rendering.
|
|
||||||
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
|
||||||
glow_draw = ImageDraw.Draw(glow)
|
|
||||||
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
|
||||||
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
|
||||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
|
||||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
|
||||||
|
|
||||||
TARGET_LOGO_W = 400
|
|
||||||
max_chars = max(len(line) for line in ASCII_ART)
|
|
||||||
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
|
||||||
_probe_cw, _ = mono_metrics(_probe_font)
|
|
||||||
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
|
||||||
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
|
||||||
char_w, char_h = mono_metrics(font_logo)
|
|
||||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
|
||||||
logo_w, logo_h = logo_mask.size
|
|
||||||
logo_x = (W - logo_w) // 2
|
|
||||||
logo_y = 380
|
|
||||||
|
|
||||||
sh_off = max(1, font_size_logo // 6)
|
|
||||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
|
||||||
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
|
||||||
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
|
||||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
|
||||||
|
|
||||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
|
||||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
|
||||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
|
||||||
sub_y = logo_y + logo_h + 48
|
|
||||||
draw = ImageDraw.Draw(img)
|
|
||||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
|
||||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
|
||||||
|
|
||||||
img = img.convert('RGB')
|
|
||||||
|
|
||||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
|
||||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
|
||||||
PYEOF
|
|
||||||
|
|
||||||
echo "=== wallpaper done ==="
|
|
||||||
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/bin/sh
# 9011-toram-rsync.hook.chroot
#
# Adds rsync to the initramfs so that live-boot's toram code takes the
# rsync --progress path instead of the silent "cp -a" fallback.
#
# live-boot's 9990-toram-todisk.sh already contains:
#   if [ -x /bin/rsync ]; then
#       rsync -a --progress ... 1>/dev/console
#   else
#       cp -a ...   # no output
#   fi
#
# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
# which copies the binary + all shared-library dependencies into the initrd.
# The initramfs is then rebuilt for the newest kernel found in /lib/modules.

set -e

HOOK_DIR="/etc/initramfs-tools/hooks"
HOOK="${HOOK_DIR}/bee-rsync"

mkdir -p "${HOOK_DIR}"

# The quoted delimiter keeps the hook body literal (no expansion at build time).
cat > "${HOOK}" << 'EOF'
#!/bin/sh
# initramfs hook: include rsync for live-boot toram progress output
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac

. /usr/share/initramfs-tools/hook-functions

if [ -x /usr/bin/rsync ]; then
    copy_exec /usr/bin/rsync /bin
fi
EOF

chmod +x "${HOOK}"

echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"

# The generated hook silently skips a missing rsync, which would leave toram
# on the quiet "cp -a" path. Surface that at build time instead.
if [ ! -x /usr/bin/rsync ]; then
    echo "9011-toram-rsync: WARNING: /usr/bin/rsync not found; initramfs will not contain rsync" >&2
fi

# Rebuild initramfs so the hook takes effect in the ISO's initrd.img
KVER=$(ls /lib/modules | sort -V | tail -1)
# Without pipefail a failing ls is masked by the pipeline, so an empty result
# is the only signal that no kernel is installed in the chroot.
if [ -z "${KVER}" ]; then
    echo "9011-toram-rsync: ERROR: no kernel version found under /lib/modules" >&2
    exit 1
fi
echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
update-initramfs -u -k "${KVER}"
echo "9011-toram-rsync: done"
|
||||||
@@ -3,6 +3,7 @@ dmidecode
|
|||||||
smartmontools
|
smartmontools
|
||||||
nvme-cli
|
nvme-cli
|
||||||
pciutils
|
pciutils
|
||||||
|
rsync
|
||||||
ipmitool
|
ipmitool
|
||||||
util-linux
|
util-linux
|
||||||
e2fsprogs
|
e2fsprogs
|
||||||
|
|||||||
@@ -65,6 +65,9 @@ done
|
|||||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
||||||
|
echo " The live medium may have been disconnected." >&2
|
||||||
|
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
||||||
|
echo " Then re-run bee-install." >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -162,10 +165,59 @@ log " Mounted."
|
|||||||
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
||||||
log " Source: $SQUASHFS"
|
log " Source: $SQUASHFS"
|
||||||
log " Target: $MOUNT_ROOT"
|
log " Target: $MOUNT_ROOT"
|
||||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
|
||||||
grep -E '^\[|^inod|^created|^extract' | \
|
# unsquashfs does not support resume, so retry the entire unpack step if the
|
||||||
while read -r line; do log " $line"; done || true
|
# source medium disappears mid-copy (e.g. CD physically disconnected).
|
||||||
log " Unpack complete."
|
UNPACK_ATTEMPTS=0
|
||||||
|
UNPACK_MAX=5
|
||||||
|
while true; do
|
||||||
|
UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
|
||||||
|
if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
|
||||||
|
die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
|
||||||
|
fi
|
||||||
|
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
||||||
|
|
||||||
|
# Re-check squashfs is reachable before each attempt
|
||||||
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
|
log " SOURCE LOST: $SQUASHFS not found."
|
||||||
|
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
||||||
|
log " then press Enter here to retry."
|
||||||
|
read -r _
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# wipe partial unpack so unsquashfs starts clean
|
||||||
|
if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
|
||||||
|
log " Cleaning partial unpack from $MOUNT_ROOT ..."
|
||||||
|
# keep the mount point itself but remove its contents
|
||||||
|
find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
UNPACK_OK=0
|
||||||
|
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
||||||
|
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
||||||
|
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
||||||
|
|
||||||
|
# Check squashfs is still reachable (gone = disc pulled during copy)
|
||||||
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
|
log " WARNING: source medium lost during unpack — will retry after remount."
|
||||||
|
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
||||||
|
read -r _
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Verify the unpack produced a usable root (presence of /etc is a basic check)
|
||||||
|
if [ -d "${MOUNT_ROOT}/etc" ]; then
|
||||||
|
log " Unpack complete."
|
||||||
|
break
|
||||||
|
else
|
||||||
|
log " WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
|
||||||
|
if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
|
||||||
|
log " Retrying in 5 s ..."
|
||||||
|
sleep 5
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
log "--- Step 6/7: Configuring installed system ---"
|
log "--- Step 6/7: Configuring installed system ---"
|
||||||
|
|||||||
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/bin/bash
# bee-remount-medium — locate the live ISO medium and mount it again on
# /run/live/medium.
#
# Intended for the case where the ISO source disc (USB/CD) was disconnected and
# /run/live/medium/live/filesystem.squashfs has gone missing; run it once the
# device has been reconnected.
#
# Usage: bee-remount-medium [--wait]
#   --wait   poll every 5 seconds until the medium shows up (handy while the
#            device is still being physically reconnected)

set -euo pipefail

MEDIUM_DIR="/run/live/medium"
SQUASHFS_REL="live/filesystem.squashfs"
WAIT_MODE=0

# Walk the command line; arguments that match no pattern are ignored.
while [ "$#" -gt 0 ]; do
    case "$1" in
        --wait|-w)
            WAIT_MODE=1
            ;;
        --help|-h)
            echo "Usage: bee-remount-medium [--wait]"
            echo " Finds and remounts the live ISO medium to $MEDIUM_DIR"
            echo " --wait retry every 5 s until a medium with squashfs is found"
            exit 0
            ;;
    esac
    shift
done

# log MESSAGE... — print the message prefixed with the current wall-clock time.
log() {
    echo "[$(date +%H:%M:%S)] $*"
}

# die MESSAGE... — log the message as an error on stderr and exit with status 1.
die() {
    log "ERROR: $*" >&2
    exit 1
}
|
||||||
|
|
||||||
|
# find_candidates — print, one per line, every block device that might hold
# the live medium: all optical drives (/dev/sr*, /dev/scd*) plus every
# /dev/sd* and /dev/vd* node whose parent disk reports removable=1 in sysfs.
#
# No attempt is made to exclude the device the system is currently running
# from; callers are expected to probe each candidate.
#
# Always returns 0, so it is safe to call directly under `set -e`.
find_candidates() {
    local dev base removable

    # CD/DVD drives. An unmatched glob stays literal and simply fails the -b test.
    for dev in /dev/sr* /dev/scd*; do
        if [ -b "$dev" ]; then
            echo "$dev"
        fi
    done

    # USB/removable disks and their partitions
    for dev in /dev/sd* /dev/vd*; do
        [ -b "$dev" ] || continue
        base=$(basename "$dev")
        # Strip the trailing partition number (sdb1 -> sdb) to reach the parent
        # disk's sysfs entry; an unreadable flag counts as "not removable".
        removable=$(cat "/sys/block/${base%%[0-9]*}/removable" 2>/dev/null || echo 0)
        if [ "$removable" = "1" ]; then
            echo "$dev"
        fi
    done

    # Do not leak the status of the last device test to the caller.
    return 0
}
|
||||||
|
|
||||||
|
# try_mount DEVICE — probe DEVICE for the live squashfs and, if present, mount
# it read-only on $MEDIUM_DIR.
#
# The device is first mounted on a throw-away directory under /tmp so that
# $MEDIUM_DIR is only touched once the squashfs has been seen.
# Returns 0 when DEVICE ends up mounted on $MEDIUM_DIR, 1 otherwise.
try_mount() {
    local dev="$1"
    local probe
    local has_squashfs=0

    probe=$(mktemp -d /tmp/bee-probe-XXXXXX)

    # Not mountable at all: discard the probe directory and give up.
    if ! mount -o ro "$dev" "$probe" 2>/dev/null; then
        rmdir "$probe" 2>/dev/null || true
        return 1
    fi

    if [ -f "${probe}/${SQUASHFS_REL}" ]; then
        has_squashfs=1
    fi

    # The probe mount is released in either case.
    umount "$probe" 2>/dev/null || true
    rmdir "$probe" 2>/dev/null || true

    if [ "$has_squashfs" != "1" ]; then
        return 1
    fi

    # Drop whatever is currently on MEDIUM_DIR (may be empty/stale), then mount
    # the device on the live path.
    umount "$MEDIUM_DIR" 2>/dev/null || true
    mkdir -p "$MEDIUM_DIR"
    if ! mount -o ro "$dev" "$MEDIUM_DIR"; then
        log "Mount of $dev on $MEDIUM_DIR failed"
        return 1
    fi

    log "Mounted $dev on $MEDIUM_DIR"
    return 0
}
|
||||||
|
|
||||||
|
# attempt — run one scan over all candidate devices and stop at the first one
# that try_mount manages to mount on $MEDIUM_DIR.
# Returns 0 after a successful mount, 1 when no candidate worked.
attempt() {
    local sq="${MEDIUM_DIR}/${SQUASHFS_REL}"

    log "Scanning for ISO medium..."
    for dev in $(find_candidates); do
        log " Trying $dev ..."
        try_mount "$dev" || continue
        # Report the size of the squashfs that is now reachable.
        log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
        return 0
    done

    return 1
}
|
||||||
|
|
||||||
|
# Entry point: a single scan by default, or an endless 5-second poll with --wait.
if [ "$WAIT_MODE" != "1" ]; then
    # One-shot mode: a failed scan is fatal.
    attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
else
    log "Waiting for live medium (press Ctrl+C to abort)..."
    # Keep scanning until a medium is mounted.
    until attempt; do
        log " Not found — retrying in 5 s (reconnect the disc now)"
        sleep 5
    done
    exit 0
fi
|
||||||
Reference in New Issue
Block a user