Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 5ba72ab315 | | |
| | 63363e9629 | | |
| | 5285c0d101 | | |
| | dca4afb8d0 | | |
| | b4280941f5 | | |
| | f74976ec4c | | |
@@ -146,7 +146,7 @@ type satRunner interface {
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
|
||||
@@ -128,6 +128,7 @@ type fakeSAT struct {
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||
runNCCLFn func(string, []int) (string, error)
|
||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNCCLFn != nil {
|
||||
return f.runNCCLFn(baseDir, gpuIndices)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotBaseDir string
|
||||
var gotGPUIndices []int
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||
gotBaseDir = baseDir
|
||||
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||
return "/tmp/nccl-tests.tar.gz", nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("RunNCCLTests error: %v", err)
|
||||
}
|
||||
if path != "/tmp/nccl-tests.tar.gz" {
|
||||
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||
}
|
||||
if gotBaseDir != "/tmp/sat" {
|
||||
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||
}
|
||||
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -59,6 +59,9 @@ type benchmarkPowerCalibrationResult struct {
|
||||
// ≥20% while server fans were below 100% duty cycle — a signal that the
|
||||
// cooling system may not be correctly configured for full GPU load.
|
||||
CoolingWarning string
|
||||
// MetricRows holds the telemetry rows from the final (converged) attempt
|
||||
// for this GPU. Used to build per-run gpu-metrics.csv.
|
||||
MetricRows []GPUMetricRow
|
||||
}
|
||||
|
||||
type benchmarkBurnProfile struct {
|
||||
@@ -2781,7 +2784,7 @@ func runBenchmarkPowerCalibration(
|
||||
infoByIndex map[int]benchmarkGPUInfo,
|
||||
logFunc func(string),
|
||||
seedLimits map[int]int,
|
||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
|
||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
|
||||
const calibDurationSec = 120
|
||||
const maxDerateW = 150
|
||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||
@@ -2795,7 +2798,7 @@ func runBenchmarkPowerCalibration(
|
||||
|
||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||
}
|
||||
if killed := KillTestWorkers(); len(killed) > 0 {
|
||||
for _, p := range killed {
|
||||
@@ -2829,6 +2832,8 @@ func runBenchmarkPowerCalibration(
|
||||
|
||||
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
|
||||
var restore []benchmarkRestoreAction
|
||||
var allCalibRows []GPUMetricRow // accumulated telemetry across all attempts
|
||||
var calibCursor float64
|
||||
|
||||
// Initialise per-GPU state.
|
||||
states := make([]*gpuCalibState, 0, len(gpuIndices))
|
||||
@@ -2981,6 +2986,8 @@ calibDone:
|
||||
ticker.Stop()
|
||||
cancelAttempt()
|
||||
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
||||
// Accumulate telemetry rows with attempt stage label.
|
||||
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
|
||||
|
||||
// Resource busy: retry with exponential back-off (shared — one DCGM session).
|
||||
if ar.err != nil && isDCGMResourceBusy(ar.err) {
|
||||
@@ -3065,6 +3072,7 @@ calibDone:
|
||||
}
|
||||
}
|
||||
}
|
||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||
s.converged = true
|
||||
continue
|
||||
}
|
||||
@@ -3103,6 +3111,7 @@ calibDone:
|
||||
} else {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||
}
|
||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||
s.converged = true
|
||||
continue
|
||||
}
|
||||
@@ -3140,7 +3149,8 @@ calibDone:
|
||||
results[s.idx] = s.calib
|
||||
}
|
||||
}
|
||||
return results, restore
|
||||
writeBenchmarkMetricsFiles(runDir, allCalibRows)
|
||||
return results, restore, allCalibRows
|
||||
}
|
||||
|
||||
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
||||
@@ -3230,21 +3240,25 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
if len(result.RampSteps) > 0 {
|
||||
b.WriteString("## Ramp Sequence\n\n")
|
||||
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
|
||||
b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
|
||||
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
|
||||
b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
|
||||
for _, step := range result.RampSteps {
|
||||
derated := "-"
|
||||
if step.Derated {
|
||||
derated = "⚠ yes"
|
||||
}
|
||||
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n",
|
||||
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
|
||||
serverDelta := "-"
|
||||
if step.ServerDeltaW > 0 {
|
||||
serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
|
||||
}
|
||||
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
|
||||
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
b.WriteString("## Per-Slot Results\n\n")
|
||||
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
|
||||
b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
|
||||
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
|
||||
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
stableLimit := "-"
|
||||
if gpu.StablePowerLimitW > 0 {
|
||||
@@ -3254,8 +3268,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n",
|
||||
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
|
||||
serverDelta := "-"
|
||||
if gpu.ServerDeltaW > 0 {
|
||||
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
|
||||
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
@@ -3284,11 +3302,19 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
|
||||
fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
|
||||
fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
|
||||
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
|
||||
if step.ServerLoadedW > 0 {
|
||||
fmt.Fprintf(&b, "ramp_step_%d_server_loaded_w=%.0f\n", step.StepIndex, step.ServerLoadedW)
|
||||
fmt.Fprintf(&b, "ramp_step_%d_server_delta_w=%.0f\n", step.StepIndex, step.ServerDeltaW)
|
||||
}
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
if gpu.StablePowerLimitW > 0 {
|
||||
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
|
||||
}
|
||||
if gpu.ServerLoadedW > 0 {
|
||||
fmt.Fprintf(&b, "gpu_%d_server_loaded_w=%.0f\n", gpu.Index, gpu.ServerLoadedW)
|
||||
fmt.Fprintf(&b, "gpu_%d_server_delta_w=%.0f\n", gpu.Index, gpu.ServerDeltaW)
|
||||
}
|
||||
}
|
||||
if sp := result.ServerPower; sp != nil && sp.Available {
|
||||
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
|
||||
@@ -3327,6 +3353,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
if infoErr != nil {
|
||||
return "", infoErr
|
||||
}
|
||||
// Capture full nvidia-smi -q snapshot at the start of the run.
|
||||
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||
}
|
||||
hostname, _ := os.Hostname()
|
||||
result := NvidiaPowerBenchResult{
|
||||
BenchmarkVersion: benchmarkVersion,
|
||||
@@ -3352,13 +3382,31 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||||
singleIPMILoadedW := make(map[int]float64, len(selected))
|
||||
var allRestoreActions []benchmarkRestoreAction
|
||||
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
|
||||
var allPowerRows []GPUMetricRow
|
||||
var powerCursor float64
|
||||
for _, idx := range selected {
|
||||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||
_ = os.MkdirAll(singleDir, 0755)
|
||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
|
||||
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
|
||||
ipmiSingleDone := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiSingleDone)
|
||||
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
|
||||
ipmiSingleDone <- w
|
||||
}
|
||||
}()
|
||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
|
||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||
ipmiSingleCancel()
|
||||
if w, ok := <-ipmiSingleDone; ok {
|
||||
singleIPMILoadedW[idx] = w
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w))
|
||||
}
|
||||
allRestoreActions = append(allRestoreActions, restore...)
|
||||
if r, ok := c[idx]; ok {
|
||||
calibByIndex[idx] = r
|
||||
@@ -3383,7 +3431,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
result.OverallStatus = "PARTIAL"
|
||||
}
|
||||
}
|
||||
gpus = append(gpus, NvidiaPowerBenchGPU{
|
||||
gpu := NvidiaPowerBenchGPU{
|
||||
Index: idx,
|
||||
Name: info.Name,
|
||||
BusID: info.BusID,
|
||||
@@ -3396,7 +3444,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
Status: status,
|
||||
Notes: append([]string(nil), calib.Notes...),
|
||||
CoolingWarning: calib.CoolingWarning,
|
||||
})
|
||||
}
|
||||
if w, ok := singleIPMILoadedW[idx]; ok && serverIdleOK && w > 0 {
|
||||
gpu.ServerLoadedW = w
|
||||
gpu.ServerDeltaW = w - serverIdleW
|
||||
}
|
||||
if len(calib.MetricRows) > 0 {
|
||||
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
||||
gpu.Telemetry = &t
|
||||
}
|
||||
gpus = append(gpus, gpu)
|
||||
}
|
||||
sort.Slice(gpus, func(i, j int) bool {
|
||||
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
|
||||
@@ -3445,20 +3502,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
|
||||
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
|
||||
|
||||
// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
|
||||
// server-side loaded power while GPUs are under stress. The goroutine is
|
||||
// cancelled as soon as Phase 2 finishes, and the average is used to compare
|
||||
// against PlatformMaxTDPW (GPU-reported stable limits sum).
|
||||
// serverLoadedW tracks the IPMI server power from the final ramp step
|
||||
// (all GPUs simultaneously loaded). Earlier steps' values are stored
|
||||
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
||||
var serverLoadedW float64
|
||||
var serverLoadedOK bool
|
||||
ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
|
||||
ipmiPhase2Done := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiPhase2Done)
|
||||
if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
|
||||
ipmiPhase2Done <- w
|
||||
}
|
||||
}()
|
||||
|
||||
// Step 1: reuse single-card calibration result directly.
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
@@ -3475,6 +3523,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
Derated: firstCalib.Derated,
|
||||
Status: "OK",
|
||||
}
|
||||
if w, ok := singleIPMILoadedW[firstIdx]; ok && serverIdleOK && w > 0 {
|
||||
ramp.ServerLoadedW = w
|
||||
ramp.ServerDeltaW = w - serverIdleW
|
||||
}
|
||||
if !firstCalib.Completed {
|
||||
ramp.Status = "FAILED"
|
||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
||||
@@ -3502,17 +3554,45 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
_ = os.MkdirAll(stepDir, 0755)
|
||||
|
||||
// Reuse the latest stable limits as starting points, but re-check every
|
||||
// active GPU in this hotter configuration.
|
||||
seedForStep := make(map[int]int, len(stableLimits))
|
||||
for k, v := range stableLimits {
|
||||
seedForStep[k] = v
|
||||
// active GPU in this hotter configuration. For the newly introduced GPU,
|
||||
// seed from its single-card calibration so we do not restart from the
|
||||
// default TDP when a prior derated limit is already known.
|
||||
seedForStep := make(map[int]int, len(subset))
|
||||
for _, idx := range subset {
|
||||
if lim, ok := stableLimits[idx]; ok && lim > 0 {
|
||||
seedForStep[idx] = lim
|
||||
continue
|
||||
}
|
||||
if base, ok := calibByIndex[idx]; ok {
|
||||
lim := int(math.Round(base.AppliedPowerLimitW))
|
||||
if lim > 0 {
|
||||
seedForStep[idx] = lim
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
|
||||
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
||||
|
||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
|
||||
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
|
||||
ipmiStepDone := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiStepDone)
|
||||
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
|
||||
ipmiStepDone <- w
|
||||
}
|
||||
}()
|
||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
|
||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||
ipmiStepCancel()
|
||||
var stepIPMILoadedW float64
|
||||
var stepIPMIOK bool
|
||||
if w, ok := <-ipmiStepDone; ok {
|
||||
stepIPMILoadedW = w
|
||||
stepIPMIOK = true
|
||||
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
|
||||
}
|
||||
// Accumulate restore actions; they all run in the outer defer.
|
||||
allRestoreActions = append(allRestoreActions, stepRestore...)
|
||||
|
||||
@@ -3575,15 +3655,17 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||
}
|
||||
|
||||
result.RampSteps = append(result.RampSteps, ramp)
|
||||
}
|
||||
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
||||
ramp.ServerLoadedW = stepIPMILoadedW
|
||||
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
|
||||
// The last step has all GPUs loaded — use it as the top-level loaded_w.
|
||||
if step == len(result.RecommendedSlotOrder) {
|
||||
serverLoadedW = stepIPMILoadedW
|
||||
serverLoadedOK = true
|
||||
}
|
||||
}
|
||||
|
||||
// Stop IPMI Phase 2 sampling and collect result.
|
||||
ipmiPhase2Cancel()
|
||||
if w, ok := <-ipmiPhase2Done; ok {
|
||||
serverLoadedW = w
|
||||
serverLoadedOK = true
|
||||
logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
|
||||
result.RampSteps = append(result.RampSteps, ramp)
|
||||
}
|
||||
|
||||
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
|
||||
@@ -3613,6 +3695,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
||||
_ = serverIdleOK // used implicitly via characterizeServerPower
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
||||
// Write top-level gpu-metrics.csv/.html aggregating all phases.
|
||||
writeBenchmarkMetricsFiles(runDir, allPowerRows)
|
||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("marshal power result: %w", err)
|
||||
|
||||
@@ -331,6 +331,13 @@ type NvidiaPowerBenchGPU struct {
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// Telemetry holds the aggregated stats from the final converged calibration
|
||||
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchStep struct {
|
||||
@@ -345,6 +352,10 @@ type NvidiaPowerBenchStep struct {
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||
|
||||
@@ -140,26 +140,56 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
if err != nil || len(squashfsFiles) == 0 {
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||
}
|
||||
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
|
||||
dstDir := installToRAMDir
|
||||
|
||||
// If the source medium is unavailable, check whether a previous run already
|
||||
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||
// directly to the loop-rebind / bind-mount steps.
|
||||
if !sourceAvailable {
|
||||
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(copiedFiles) > 0 {
|
||||
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||
// Proceed to rebind with the already-copied files.
|
||||
for _, dst := range copiedFiles {
|
||||
base := filepath.Base(dst)
|
||||
// Re-associate the loop device that was originally backed by the
|
||||
// source file (now gone); find it by the old source path pattern.
|
||||
srcGuess := "/run/live/medium/live/" + base
|
||||
loopDev, lerr := findLoopForFile(srcGuess)
|
||||
if lerr != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||
continue
|
||||
}
|
||||
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
goto bindMedium
|
||||
}
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||
}
|
||||
|
||||
{
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
}
|
||||
|
||||
if state.CopyPresent {
|
||||
log("Removing stale partial RAM copy before retry...")
|
||||
}
|
||||
@@ -199,6 +229,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
||||
}
|
||||
}
|
||||
|
||||
bindMedium:
|
||||
log("Copying remaining medium files...")
|
||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||
|
||||
@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
gpuCount := len(selected)
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
|
||||
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||
if len(env) != 2 {
|
||||
|
||||
@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
@@ -1527,8 +1527,6 @@ function satModeChanged() {
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
||||
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
@@ -1776,7 +1774,7 @@ function runAllSAT() {
|
||||
const cycles = 1;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
@@ -2016,9 +2014,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// benchmarkHistoryRun is one row of the benchmark history table: the data
// kept for a single past benchmark run.
type benchmarkHistoryRun struct {
	generatedAt   time.Time       // generation timestamp taken from the stored result
	displayTime   string          // generatedAt pre-formatted for display
	gpuScores     map[int]float64 // GPU index → composite score
	gpuStatuses   map[int]string  // GPU index → status ("OK", "WARNING", "FAILED", …)
	overallStatus string          // run-level status; rendered as "OK" when empty
}
|
||||
|
||||
func renderBenchmark(opts HandlerOptions) string {
|
||||
@@ -2082,7 +2082,7 @@ func renderBenchmark(opts HandlerOptions) string {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
|
||||
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||
|
||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||
@@ -2326,7 +2326,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
|
||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<div style="overflow-x:auto">`)
|
||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
|
||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||||
for i := 0; i <= maxGPUIndex; i++ {
|
||||
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||||
}
|
||||
@@ -2335,13 +2335,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
overallColor := "var(--ok)"
|
||||
overallLabel := run.overallStatus
|
||||
if overallLabel == "" {
|
||||
overallLabel = "OK"
|
||||
}
|
||||
if overallLabel == "FAILED" {
|
||||
overallColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if overallLabel != "OK" {
|
||||
overallColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||||
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||||
score, ok := run.gpuScores[idx]
|
||||
if !ok {
|
||||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||
continue
|
||||
}
|
||||
b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||
gpuStatus := run.gpuStatuses[idx]
|
||||
scoreColor := ""
|
||||
switch gpuStatus {
|
||||
case "FAILED":
|
||||
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||||
case "WARNING", "PARTIAL":
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
case "", "OK":
|
||||
// no override
|
||||
default:
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
}
|
||||
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||
}
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
@@ -2375,12 +2398,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
|
||||
continue
|
||||
}
|
||||
run := benchmarkHistoryRun{
|
||||
generatedAt: result.GeneratedAt,
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
gpuScores: make(map[int]float64),
|
||||
generatedAt: result.GeneratedAt,
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
gpuScores: make(map[int]float64),
|
||||
gpuStatuses: make(map[int]string),
|
||||
overallStatus: result.OverallStatus,
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||
if gpu.Index > maxGPUIndex {
|
||||
maxGPUIndex = gpu.Index
|
||||
}
|
||||
@@ -2449,31 +2475,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
|
||||
if len(latest.GPUs) > 0 {
|
||||
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for _, gpu := range latest.GPUs {
|
||||
derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
|
||||
// finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
|
||||
// falling back to single-card applied limit if the ramp hasn't run.
|
||||
finalLimitW := gpu.StablePowerLimitW
|
||||
if finalLimitW <= 0 {
|
||||
finalLimitW = gpu.AppliedPowerLimitW
|
||||
}
|
||||
// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
|
||||
derated := gpu.Derated ||
|
||||
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||
rowStyle := ""
|
||||
achievedStyle := ""
|
||||
finalStyle := ""
|
||||
if derated {
|
||||
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||
achievedStyle = ` style="color:#e6a000;font-weight:600"`
|
||||
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||
}
|
||||
statusLabel := gpu.Status
|
||||
if statusLabel == "" {
|
||||
statusLabel = "OK"
|
||||
}
|
||||
statusColor := "var(--ok)"
|
||||
if statusLabel != "OK" {
|
||||
if statusLabel == "FAILED" {
|
||||
statusColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if statusLabel != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
nominalStr := "-"
|
||||
if gpu.DefaultPowerLimitW > 0 {
|
||||
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||
}
|
||||
achievedStr := "-"
|
||||
singleStr := "-"
|
||||
if gpu.AppliedPowerLimitW > 0 {
|
||||
achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||
}
|
||||
multiStr := "-"
|
||||
if gpu.StablePowerLimitW > 0 {
|
||||
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||
}
|
||||
p95Str := "-"
|
||||
if gpu.MaxObservedPowerW > 0 {
|
||||
@@ -2483,7 +2523,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||
b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
|
||||
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
@@ -2517,7 +2558,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
|
||||
@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`Runs in Validate and Stress.`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`Intended to stay short enough for Validate.`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
@@ -613,8 +613,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
}
|
||||
a := q.opts.App
|
||||
|
||||
recovered := len(j.lines) > 0
|
||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||
if len(j.lines) > 0 {
|
||||
if recovered {
|
||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||
}
|
||||
|
||||
@@ -736,15 +737,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: platform.NvidiaStressLoaderNCCL,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
}, j.append)
|
||||
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||
case "nvidia-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
|
||||
@@ -16,6 +16,11 @@ menuentry "EASY-BEE" {
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — load to RAM (toram)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
|
||||
@@ -63,8 +63,10 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
#!/bin/sh
|
||||
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
||||
set -e
|
||||
echo "=== generating bee wallpaper ==="
|
||||
mkdir -p /usr/share/bee
|
||||
|
||||
python3 - <<'PYEOF'
|
||||
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||
import os
|
||||
|
||||
W, H = 1920, 1080
|
||||
|
||||
ASCII_ART = [
|
||||
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
||||
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
||||
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
||||
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
||||
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
||||
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
||||
]
|
||||
SUBTITLE = " Hardware Audit LiveCD"
|
||||
|
||||
FG = (0xF6, 0xD0, 0x47)
|
||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||
SHADOW = (0x5E, 0x47, 0x05)
|
||||
SUB = (0x96, 0x7A, 0x17)
|
||||
BG = (0x05, 0x05, 0x05)
|
||||
|
||||
MONO_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
||||
]
|
||||
SUB_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
||||
]
|
||||
|
||||
|
||||
def load_font(candidates, size):
    """Return a TrueType font at `size` from the first existing candidate path.

    Candidates are probed in order with os.path.exists; when none exists the
    PIL default font is returned instead.
    """
    chosen = next((p for p in candidates if os.path.exists(p)), None)
    if chosen is None:
        return ImageFont.load_default()
    return ImageFont.truetype(chosen, size)
|
||||
|
||||
|
||||
def mono_metrics(font):
    """Measure the character cell of `font` and return (char_w, char_h).

    char_w is the rounded text length of "M"; char_h is the height of the
    bounding box of "Mg". Measurements use a throwaway W x H greyscale canvas.
    """
    scratch = ImageDraw.Draw(Image.new('L', (W, H), 0))
    advance = int(round(scratch.textlength("M", font=font)))
    _, top, _, bottom = scratch.textbbox((0, 0), "Mg", font=font)
    return advance, bottom - top
|
||||
|
||||
|
||||
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
    """Draw `lines` onto a fixed character grid and return it as an 'L' mask.

    Each non-space character is drawn on its own at (column * char_w,
    row * (char_h + line_gap)) with fill 255, so glyph placement follows the
    grid rather than the font's own advance widths. The mask is sized to the
    longest line and to all rows plus the gaps between them.
    """
    row_pitch = char_h + line_gap
    canvas_w = max(map(len, lines)) * char_w
    canvas_h = len(lines) * char_h + line_gap * (len(lines) - 1)
    mask = Image.new('L', (canvas_w, canvas_h), 0)
    pen = ImageDraw.Draw(mask)
    for row_idx, text in enumerate(lines):
        top = row_idx * row_pitch
        for col_idx, glyph in enumerate(text):
            if glyph != ' ':
                pen.text((col_idx * char_w, top), glyph, font=font, fill=255)
    return mask
|
||||
|
||||
|
||||
img = Image.new('RGB', (W, H), BG)
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Soft amber glow under the logo without depending on font rendering.
|
||||
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
||||
glow_draw = ImageDraw.Draw(glow)
|
||||
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
||||
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||
|
||||
TARGET_LOGO_W = 400
|
||||
max_chars = max(len(line) for line in ASCII_ART)
|
||||
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
||||
_probe_cw, _ = mono_metrics(_probe_font)
|
||||
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
||||
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
||||
char_w, char_h = mono_metrics(font_logo)
|
||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
||||
logo_w, logo_h = logo_mask.size
|
||||
logo_x = (W - logo_w) // 2
|
||||
logo_y = 380
|
||||
|
||||
sh_off = max(1, font_size_logo // 6)
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
||||
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||
|
||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||
sub_y = logo_y + logo_h + 48
|
||||
draw = ImageDraw.Draw(img)
|
||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||
|
||||
img = img.convert('RGB')
|
||||
|
||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||
PYEOF
|
||||
|
||||
echo "=== wallpaper done ==="
|
||||
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/sh
# 9011-toram-rsync.hook.chroot
#
# Adds rsync to the initramfs so that live-boot's toram code takes the
# rsync --progress path instead of the silent "cp -a" fallback.
#
# live-boot's 9990-toram-todisk.sh already contains:
#   if [ -x /bin/rsync ]; then
#       rsync -a --progress ... 1>/dev/console
#   else
#       cp -a ...            # no output
#   fi
#
# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
# which copies the binary + all shared-library dependencies into the initrd.

set -e

HOOK_DIR="/etc/initramfs-tools/hooks"
HOOK="${HOOK_DIR}/bee-rsync"

mkdir -p "${HOOK_DIR}"

# The heredoc delimiter is quoted so nothing inside is expanded here; the hook
# is written out verbatim and only evaluated when the initramfs is built.
cat > "${HOOK}" << 'EOF'
#!/bin/sh
# initramfs hook: include rsync for live-boot toram progress output
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac

. /usr/share/initramfs-tools/hook-functions

if [ -x /usr/bin/rsync ]; then
    copy_exec /usr/bin/rsync /bin
fi
EOF

chmod +x "${HOOK}"

echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"

# Rebuild initramfs so the hook takes effect in the ISO's initrd.img.
# The highest version-sorted entry under /lib/modules is used as the kernel.
KVER=$(ls /lib/modules | sort -V | tail -1)

# The pipeline above exits 0 even when ls finds nothing (its status is tail's),
# so an empty result must be caught here instead of being passed on to
# update-initramfs as an empty -k argument.
if [ -z "${KVER}" ]; then
    echo "9011-toram-rsync: ERROR: no kernel found under /lib/modules; cannot rebuild initramfs" >&2
    exit 1
fi

echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
update-initramfs -u -k "${KVER}"
echo "9011-toram-rsync: done"
|
||||
@@ -3,6 +3,7 @@ dmidecode
|
||||
smartmontools
|
||||
nvme-cli
|
||||
pciutils
|
||||
rsync
|
||||
ipmitool
|
||||
util-linux
|
||||
e2fsprogs
|
||||
|
||||
@@ -65,6 +65,9 @@ done
|
||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
||||
echo " The live medium may have been disconnected." >&2
|
||||
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
||||
echo " Then re-run bee-install." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -162,10 +165,59 @@ log " Mounted."
|
||||
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
||||
log " Source: $SQUASHFS"
|
||||
log " Target: $MOUNT_ROOT"
|
||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
||||
grep -E '^\[|^inod|^created|^extract' | \
|
||||
while read -r line; do log " $line"; done || true
|
||||
log " Unpack complete."
|
||||
|
||||
# unsquashfs does not support resume, so retry the entire unpack step if the
|
||||
# source medium disappears mid-copy (e.g. CD physically disconnected).
|
||||
UNPACK_ATTEMPTS=0
|
||||
UNPACK_MAX=5
|
||||
while true; do
|
||||
UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
|
||||
if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
|
||||
die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
|
||||
fi
|
||||
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
||||
|
||||
# Re-check squashfs is reachable before each attempt
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
log " SOURCE LOST: $SQUASHFS not found."
|
||||
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
||||
log " then press Enter here to retry."
|
||||
read -r _
|
||||
continue
|
||||
fi
|
||||
|
||||
# wipe partial unpack so unsquashfs starts clean
|
||||
if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
|
||||
log " Cleaning partial unpack from $MOUNT_ROOT ..."
|
||||
# keep the mount point itself but remove its contents
|
||||
find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
|
||||
fi
|
||||
|
||||
UNPACK_OK=0
|
||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
||||
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
||||
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
||||
|
||||
# Check squashfs is still reachable (gone = disc pulled during copy)
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
log " WARNING: source medium lost during unpack — will retry after remount."
|
||||
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
||||
read -r _
|
||||
continue
|
||||
fi
|
||||
|
||||
# Verify the unpack produced a usable root (presence of /etc is a basic check)
|
||||
if [ -d "${MOUNT_ROOT}/etc" ]; then
|
||||
log " Unpack complete."
|
||||
break
|
||||
else
|
||||
log " WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
|
||||
if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
|
||||
log " Retrying in 5 s ..."
|
||||
sleep 5
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
log "--- Step 6/7: Configuring installed system ---"
|
||||
|
||||
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/bin/bash
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
#
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
# was lost and /run/live/medium/live/filesystem.squashfs is missing.
#
# Usage: bee-remount-medium [--wait]
#   --wait   keep retrying every 5 seconds until the medium is found (useful
#            while physically reconnecting the device)

set -euo pipefail

MEDIUM_DIR="/run/live/medium"
SQUASHFS_REL="live/filesystem.squashfs"
WAIT_MODE=0

# Option scan: --wait/-w turns on retry mode, --help/-h prints usage and exits.
# Any other argument is silently ignored.
for opt in "$@"; do
    if [[ $opt == "--wait" || $opt == "-w" ]]; then
        WAIT_MODE=1
    elif [[ $opt == "--help" || $opt == "-h" ]]; then
        echo "Usage: bee-remount-medium [--wait]"
        echo " Finds and remounts the live ISO medium to $MEDIUM_DIR"
        echo " --wait retry every 5 s until a medium with squashfs is found"
        exit 0
    fi
done
|
||||
|
||||
# log prints its arguments on stdout prefixed with the current HH:MM:SS time.
log() {
    printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}

# die logs an "ERROR: ..." line on stderr and terminates the script with status 1.
die() {
    log "ERROR: $*" >&2
    exit 1
}
|
||||
|
||||
# Print candidate block devices, one path per line:
#   - every /dev/sr* and /dev/scd* node that is a block device (CD/DVD drives)
#   - every /dev/sd* and /dev/vd* block device (whole disk or partition) whose
#     parent disk reports 1 in /sys/block/<disk>/removable
find_candidates() {
    local node name disk flag

    for node in /dev/sr* /dev/scd*; do
        if [ -b "$node" ]; then
            echo "$node"
        fi
    done

    for node in /dev/sd* /dev/vd*; do
        [ -b "$node" ] || continue
        name=${node##*/}
        # Cut the name at its first digit to get the parent disk (sdb1 -> sdb).
        disk=${name%%[0-9]*}
        # A missing or unreadable sysfs flag counts as "not removable".
        flag=$(cat "/sys/block/${disk}/removable" 2>/dev/null || echo 0)
        [ "$flag" = "1" ] && echo "$node"
    done
}
|
||||
|
||||
# Probe device $1 for the live squashfs and, if present, mount it on $MEDIUM_DIR.
#
# The device is first mounted read-only on a temporary directory. Only when
# ${SQUASHFS_REL} exists there is the current $MEDIUM_DIR mount (if any)
# released and the device mounted read-only in its place.
# Returns 0 when the device ends up mounted on $MEDIUM_DIR, 1 otherwise.
try_mount() {
    local candidate="$1"
    local probe_dir
    probe_dir=$(mktemp -d /tmp/bee-probe-XXXXXX)

    # Not mountable at all: drop the probe directory and give up on this device.
    if ! mount -o ro "$candidate" "$probe_dir" 2>/dev/null; then
        rmdir "$probe_dir" 2>/dev/null || true
        return 1
    fi

    # Mountable, but not our medium.
    if [ ! -f "${probe_dir}/${SQUASHFS_REL}" ]; then
        umount "$probe_dir" 2>/dev/null || true
        rmdir "$probe_dir" 2>/dev/null || true
        return 1
    fi

    # Found it: release the probe mount before mounting on the live path.
    umount "$probe_dir" 2>/dev/null || true
    rmdir "$probe_dir" 2>/dev/null || true

    # Whatever is currently on MEDIUM_DIR may be empty or stale; failures to
    # unmount it are ignored.
    umount "$MEDIUM_DIR" 2>/dev/null || true
    mkdir -p "$MEDIUM_DIR"

    if ! mount -o ro "$candidate" "$MEDIUM_DIR"; then
        log "Mount of $candidate on $MEDIUM_DIR failed"
        return 1
    fi
    log "Mounted $candidate on $MEDIUM_DIR"
    return 0
}
|
||||
|
||||
# Run one scan: try every device reported by find_candidates in order and stop
# at the first one that try_mount accepts. Returns 0 on success, 1 when no
# candidate carried the squashfs.
attempt() {
    local candidates node image
    log "Scanning for ISO medium..."
    # The candidate list is collected once, before any mount is attempted.
    candidates=$(find_candidates) || true
    for node in $candidates; do
        log " Trying $node ..."
        if ! try_mount "$node"; then
            continue
        fi
        image="${MEDIUM_DIR}/${SQUASHFS_REL}"
        log "SUCCESS: squashfs available at $image ($(du -sh "$image" | cut -f1))"
        return 0
    done
    return 1
}
|
||||
|
||||
# Entry point: a single scan by default; with --wait, rescan every 5 seconds
# until a medium is found (the user aborts with Ctrl+C).
if [ "$WAIT_MODE" != "1" ]; then
    attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
else
    log "Waiting for live medium (press Ctrl+C to abort)..."
    until attempt; do
        log " Not found — retrying in 5 s (reconnect the disc now)"
        sleep 5
    done
    exit 0
fi
|
||||
Reference in New Issue
Block a user