Compare commits

...

5 Commits
v7.21 ... v8.1

Author SHA1 Message Date
a636146dbd Fix power calibration failing due to DCGM resource contention
When a targeted_power attempt is cancelled (e.g. after sw_thermal
throttle), nv-hostengine holds the diagnostic slot asynchronously.
The next attempt immediately received DCGM_ST_IN_USE (exit 222)
and incorrectly derated the power limit.

Now: exit 222 is detected via isDCGMResourceBusy and triggers an
exponential back-off retry at the same power limit (1s, 2s, 4s, …
up to 256s). Once the back-off delay would exceed 300s the
calibration fails, indicating the slot is persistently held.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 20:41:17 +03:00
Mikhail Chusavitin
303de2df04 Add slot-aware ramp sequence to bee-bench power 2026-04-14 17:47:40 +03:00
Mikhail Chusavitin
95124d228f Split bee-bench into perf and power workflows 2026-04-14 17:33:13 +03:00
Mikhail Chusavitin
54338dbae5 Unify live RAM runtime state 2026-04-14 16:18:33 +03:00
Mikhail Chusavitin
2be7ae6d28 Refine NVIDIA benchmark phase timing 2026-04-14 14:12:06 +03:00
24 changed files with 1827 additions and 449 deletions

View File

@@ -30,7 +30,9 @@ var (
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
)
type App struct {
@@ -84,6 +86,7 @@ type installer interface {
InstallToDisk(ctx context.Context, device string, logFile string) error
IsLiveMediaInRAM() bool
LiveBootSource() platform.LiveBootSource
LiveMediaRAMState() platform.LiveMediaRAMState
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
}
@@ -108,6 +111,10 @@ func (a *App) LiveBootSource() platform.LiveBootSource {
return a.installer.LiveBootSource()
}
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
return a.installer.LiveMediaRAMState()
}
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
return a.installer.RunInstallToRAM(ctx, logFunc)
}
@@ -117,6 +124,7 @@ type satRunner interface {
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -562,11 +570,18 @@ func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOp
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBenchmarkBaseDir
baseDir = DefaultBeeBenchPerfDir
}
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPowerDir
}
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir

View File

@@ -122,6 +122,7 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
type fakeSAT struct {
runNvidiaFn func(string) (string, error)
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runNvidiaComputeFn func(string, int, []int) (string, error)
runNvidiaPowerFn func(string, int, []int) (string, error)
@@ -154,6 +155,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
if f.runNvidiaPowerBenchFn != nil {
return f.runNvidiaPowerBenchFn(baseDir, opts)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaTargetedStressFn != nil {
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)

File diff suppressed because it is too large Load Diff

View File

@@ -48,7 +48,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
}
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
if result.RampStep > 0 && result.RampTotal > 0 {
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
@@ -83,15 +83,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
b.WriteString("**Compute score** is derived from two phases:\n\n")
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
@@ -170,6 +170,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
if gpu.PowerLimitW > 0 {
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
}
if gpu.PowerLimitDerated {
fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
}
if gpu.CalibratedPeakPowerW > 0 {
if gpu.CalibratedPeakTempC > 0 {
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
} else {
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
}
}
if gpu.LockedGraphicsClockMHz > 0 {
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
}
@@ -188,7 +198,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// Per-precision stability phases.
if len(gpu.PrecisionSteady) > 0 {
b.WriteString("**Per-precision stability:**\n\n")
b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n")
b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
for _, p := range gpu.PrecisionSteady {
eccCorr := "—"
eccUncorr := "—"
@@ -196,8 +206,12 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
}
fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
status := p.Status
if strings.TrimSpace(status) == "" {
status = "OK"
}
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
eccCorr, eccUncorr)
}
b.WriteString("\n")
@@ -364,6 +378,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64)
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
var b strings.Builder
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))

View File

@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
{
name: "default",
profile: "",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
},
{
name: "stability",
profile: "stability",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
},
{
name: "overnight",
profile: "overnight",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
},
}
@@ -41,6 +41,129 @@ func TestResolveBenchmarkProfile(t *testing.T) {
}
}
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
t.Parallel()
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
benchmarkPrecisionPhases,
func(label string) string { return label },
)
if len(labels) != 7 || len(phases) != 7 {
t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
}
if basePhaseSec != 60 {
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
}
if mixedPhaseSec != 300 {
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
}
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
}
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
t.Parallel()
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
benchmarkPrecisionPhases,
func(label string) string { return label },
)
if basePhaseSec != 300 {
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
}
if mixedPhaseSec != 3600 {
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
t.Parallel()
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
benchmarkPrecisionPhases,
func(label string) string { return label },
)
if basePhaseSec != 3600 {
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
}
if mixedPhaseSec != 14400 {
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
t.Parallel()
phases := []benchmarkPlannedPhase{
{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
}
rows := []GPUMetricRow{
{ElapsedSec: 5},
{ElapsedSec: 15},
{ElapsedSec: 25},
{ElapsedSec: 65},
}
got := splitBenchmarkRowsByPlannedPhase(rows, phases)
if len(got["fp8"]) != 1 {
t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
}
if len(got["fp16"]) != 1 {
t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
}
if len(got["mixed"]) != 2 {
t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
}
}
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
t.Parallel()
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
t.Fatalf("supported=%v", got)
}
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
t.Fatalf("supported=%v", got)
}
}
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
t.Parallel()
cases := []struct {
name string
raw string
wantStatus string
}{
{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
}
for _, tc := range cases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
if got != tc.wantStatus {
t.Fatalf("status=%q want %q", got, tc.wantStatus)
}
})
}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel()
@@ -65,8 +188,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
"[gpu 0] compute_capability=9.0",
"[gpu 0] backend=cublasLt",
"[gpu 0] duration_s=10",
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
"[gpu 0] int8_tensor_iterations=80",
"[gpu 0] fp16_tensor_iterations=200",
"[gpu 0] fp8_e4m3_iterations=50",
"[gpu 0] status=OK",
@@ -79,15 +204,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
if got.ComputeCapability != "9.0" {
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
}
if len(got.Profiles) != 2 {
t.Fatalf("profiles=%d want 2", len(got.Profiles))
if len(got.Profiles) != 3 {
t.Fatalf("profiles=%d want 3", len(got.Profiles))
}
if got.Profiles[0].TeraOpsPerSec <= 0 {
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
}
if got.Profiles[0].Category != "fp16_bf16" {
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
}
if got.Profiles[1].Category != "fp8" {
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
}
if got.Profiles[2].Category != "int8" {
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
}
if got.Profiles[2].Weight != 0.25 {
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
}
}
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {

View File

@@ -104,6 +104,7 @@ type BenchmarkGPUResult struct {
Backend string `json:"backend,omitempty"`
Status string `json:"status"`
PowerLimitW float64 `json:"power_limit_w,omitempty"`
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// CalibratedPeakPowerW is the p95 power measured during a short
@@ -111,6 +112,8 @@ type BenchmarkGPUResult struct {
// Used as the reference denominator for PowerSustainScore instead of
// the hardware default limit, which bee-gpu-burn cannot reach.
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
@@ -119,6 +122,7 @@ type BenchmarkGPUResult struct {
Baseline BenchmarkTelemetrySummary `json:"baseline"`
Steady BenchmarkTelemetrySummary `json:"steady"`
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
PrecisionFailures []string `json:"precision_failures,omitempty"`
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
// ECC error delta accumulated over the full benchmark (all phases combined).
@@ -179,7 +183,7 @@ type BenchmarkPrecisionResult struct {
Iterations uint64 `json:"iterations,omitempty"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
// Weight is the fp32-equivalence factor for this precision category.
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
Weight float64 `json:"weight,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
@@ -225,13 +229,15 @@ type BenchmarkServerPower struct {
// type runs at a time the PowerCVPct here is a genuine stability signal.
type BenchmarkPrecisionSteadyPhase struct {
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
Status string `json:"status,omitempty"`
Steady BenchmarkTelemetrySummary `json:"steady"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
// ECC errors accumulated during this precision phase only.
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
// Any uncorrected = serious fault triggered by this precision workload.
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
Notes string `json:"notes,omitempty"`
}
type BenchmarkInterconnectResult struct {
@@ -245,3 +251,45 @@ type BenchmarkInterconnectResult struct {
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
Notes []string `json:"notes,omitempty"`
}
type NvidiaPowerBenchResult struct {
BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"`
SelectedGPUIndices []int `json:"selected_gpu_indices"`
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
OverallStatus string `json:"overall_status"`
Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
}
type NvidiaPowerBenchGPU struct {
Index int `json:"index"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
Derated bool `json:"derated,omitempty"`
Status string `json:"status"`
OccupiedSlots []int `json:"occupied_slots,omitempty"`
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
Notes []string `json:"notes,omitempty"`
}
type NvidiaPowerBenchStep struct {
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"`
AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"`
DeratedGPUCount int `json:"derated_gpu_count,omitempty"`
Status string `json:"status"`
Notes []string `json:"notes,omitempty"`
}

View File

@@ -14,6 +14,8 @@ import (
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
type GPUMetricRow struct {
Stage string `json:"stage,omitempty"`
StageStartSec float64 `json:"stage_start_sec,omitempty"`
StageEndSec float64 `json:"stage_end_sec,omitempty"`
ElapsedSec float64 `json:"elapsed_sec"`
GPUIndex int `json:"index"`
TempC float64 `json:"temp_c"`
@@ -509,11 +511,22 @@ func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
if name == "" {
name = "run"
}
start := row.StageStartSec
end := row.StageEndSec
if end <= start {
start = row.ElapsedSec
end = row.ElapsedSec
}
if len(spans) == 0 || spans[len(spans)-1].Name != name {
spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec})
spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
continue
}
spans[len(spans)-1].End = row.ElapsedSec
if start < spans[len(spans)-1].Start {
spans[len(spans)-1].Start = start
}
if end > spans[len(spans)-1].End {
spans[len(spans)-1].End = end
}
}
for i := range spans {
if spans[i].End <= spans[i].Start {

View File

@@ -11,20 +11,10 @@ import (
"strings"
)
const installToRAMDir = "/dev/shm/bee-live"
func (s *System) IsLiveMediaInRAM() bool {
fsType := mountFSType("/run/live/medium")
if fsType == "" {
// No medium mount at all — fall back to toram kernel parameter.
return toramActive()
}
if strings.EqualFold(fsType, "tmpfs") {
return true
}
// When RunInstallToRAM copies squashfs to /dev/shm/bee-live but the bind
// mount of /run/live/medium fails (common for CD-ROM boots), the medium
// fstype still shows the CD-ROM type. Check whether the RAM copy exists.
files, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
return len(files) > 0
return s.LiveMediaRAMState().InRAM
}
func (s *System) LiveBootSource() LiveBootSource {
@@ -56,14 +46,95 @@ func (s *System) LiveBootSource() LiveBootSource {
return status
}
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
return evaluateLiveMediaRAMState(
s.LiveBootSource(),
toramActive(),
globPaths("/run/live/medium/live/*.squashfs"),
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
)
}
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
state := LiveMediaRAMState{
LiveBootSource: status,
ToramActive: toram,
CopyPresent: len(copiedSquashfs) > 0,
}
if status.InRAM {
state.State = "in_ram"
state.Status = "ok"
state.CopyComplete = true
state.Message = "Running from RAM — installation media can be safely disconnected."
return state
}
expected := pathBaseSet(sourceSquashfs)
copied := pathBaseSet(copiedSquashfs)
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
switch {
case state.CopyComplete:
state.State = "partial"
state.Status = "partial"
state.CanStartCopy = true
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
case state.CopyPresent:
state.State = "partial"
state.Status = "partial"
state.CanStartCopy = true
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
case toram:
state.State = "toram_failed"
state.Status = "failed"
state.CanStartCopy = true
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
default:
state.State = "not_in_ram"
state.Status = "warning"
state.CanStartCopy = true
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
}
return state
}
func globPaths(pattern string) []string {
matches, _ := filepath.Glob(pattern)
return matches
}
func pathBaseSet(paths []string) map[string]struct{} {
out := make(map[string]struct{}, len(paths))
for _, path := range paths {
base := strings.TrimSpace(filepath.Base(path))
if base != "" {
out[base] = struct{}{}
}
}
return out
}
func setContainsAll(have, want map[string]struct{}) bool {
if len(want) == 0 {
return false
}
for name := range want {
if _, ok := have[name]; !ok {
return false
}
}
return true
}
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
log := func(msg string) {
if logFunc != nil {
logFunc(msg)
}
}
if s.IsLiveMediaInRAM() {
state := s.LiveMediaRAMState()
if state.InRAM {
log("Already running from RAM — installation media can be safely disconnected.")
return nil
}
@@ -88,10 +159,21 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
humanBytes(needed+headroom), humanBytes(free))
}
dstDir := "/dev/shm/bee-live"
dstDir := installToRAMDir
if state.CopyPresent {
log("Removing stale partial RAM copy before retry...")
}
_ = os.RemoveAll(dstDir)
if err := os.MkdirAll(dstDir, 0755); err != nil {
return fmt.Errorf("create tmpfs dir: %v", err)
}
defer func() {
if retErr == nil {
return
}
_ = os.RemoveAll(dstDir)
log("Removed incomplete RAM copy.")
}()
for _, sf := range squashfsFiles {
if err := ctx.Err(); err != nil {

View File

@@ -58,3 +58,46 @@ func TestDescribeLiveBootSource(t *testing.T) {
t.Fatalf("got %q want /run/live/medium", got)
}
}
func TestEvaluateLiveMediaRAMState(t *testing.T) {
t.Parallel()
t.Run("in_ram", func(t *testing.T) {
state := evaluateLiveMediaRAMState(
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
false,
nil,
nil,
)
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
t.Fatalf("state=%+v", state)
}
})
t.Run("partial_copy_after_cancel", func(t *testing.T) {
state := evaluateLiveMediaRAMState(
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
false,
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
)
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
t.Fatalf("state=%+v", state)
}
if state.CopyComplete {
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
}
})
t.Run("toram_failed", func(t *testing.T) {
state := evaluateLiveMediaRAMState(
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
true,
nil,
nil,
)
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
t.Fatalf("state=%+v", state)
}
})
}

View File

@@ -171,25 +171,28 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
return ToolStatus{Name: display}
}
// collectToRAMHealth checks whether the LiveCD ISO has been copied to RAM.
// Status values: "ok" = in RAM, "warning" = toram not active (no copy attempted),
// "failed" = toram was requested but medium is not in RAM (copy failed or in progress).
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
// incomplete RAM copy exists but runtime still depends on the boot medium,
// "failed" = toram was requested but medium is not in RAM.
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
inRAM := s.IsLiveMediaInRAM()
active := toramActive()
switch {
case inRAM:
health.ToRAMStatus = "ok"
case active:
// toram was requested but medium is not yet/no longer in RAM
health.ToRAMStatus = "failed"
state := s.LiveMediaRAMState()
health.ToRAMStatus = state.Status
switch state.Status {
case "ok":
return
case "failed":
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "toram_copy_failed",
Severity: "warning",
Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
Description: state.Message,
})
case "partial":
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "toram_copy_partial",
Severity: "warning",
Description: state.Message,
})
default:
health.ToRAMStatus = "warning"
}
}
@@ -211,13 +214,13 @@ func findUSBExportMount() string {
// fs types that are expected on USB export drives
exportFSTypes := map[string]bool{
"vfat": true,
"exfat": true,
"ext2": true,
"ext3": true,
"ext4": true,
"ntfs": true,
"ntfs3": true,
"vfat": true,
"exfat": true,
"ext2": true,
"ext3": true,
"ext4": true,
"ntfs": true,
"ntfs3": true,
"fuseblk": true,
}

View File

@@ -9,6 +9,17 @@ type LiveBootSource struct {
Device string `json:"device,omitempty"`
}
type LiveMediaRAMState struct {
LiveBootSource
State string `json:"state"`
Status string `json:"status"`
ToramActive bool `json:"toram_active,omitempty"`
CopyPresent bool `json:"copy_present,omitempty"`
CopyComplete bool `json:"copy_complete,omitempty"`
CanStartCopy bool `json:"can_start_copy,omitempty"`
Message string `json:"message,omitempty"`
}
type InterfaceInfo struct {
Name string
State string

View File

@@ -15,17 +15,17 @@ type HardwareIngestRequest struct {
}
type RuntimeHealth struct {
Status string `json:"status"`
CheckedAt string `json:"checked_at"`
ExportDir string `json:"export_dir,omitempty"`
DriverReady bool `json:"driver_ready,omitempty"`
CUDAReady bool `json:"cuda_ready,omitempty"`
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
NetworkStatus string `json:"network_status,omitempty"`
// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
ToRAMStatus string `json:"toram_status,omitempty"`
Status string `json:"status"`
CheckedAt string `json:"checked_at"`
ExportDir string `json:"export_dir,omitempty"`
DriverReady bool `json:"driver_ready,omitempty"`
CUDAReady bool `json:"cuda_ready,omitempty"`
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
NetworkStatus string `json:"network_status,omitempty"`
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
ToRAMStatus string `json:"toram_status,omitempty"`
// USBExportPath: mount point of the first writable USB drive found, empty if none.
USBExportPath string `json:"usb_export_path,omitempty"`
USBExportPath string `json:"usb_export_path,omitempty"`
Issues []RuntimeIssue `json:"issues,omitempty"`
Tools []RuntimeToolStatus `json:"tools,omitempty"`
Services []RuntimeServiceStatus `json:"services,omitempty"`

View File

@@ -110,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
switch strings.TrimSpace(target) {
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
"nvidia-bandwidth", "nvidia-stress":
return true
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
return taskPriorityInstallToRAM
case "audit":
return taskPriorityAudit
case "nvidia-benchmark":
case "nvidia-bench-perf", "nvidia-bench-power":
return taskPriorityBenchmark
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
return taskPriorityBurn
@@ -526,14 +526,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
return
}
var body struct {
Duration int `json:"duration"`
StressMode bool `json:"stress_mode"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
StaggerGPUStart bool `json:"stagger_gpu_start"`
ParallelGPUs bool `json:"parallel_gpus"`
Loader string `json:"loader"`
var body struct {
Duration int `json:"duration"`
StressMode bool `json:"stress_mode"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
StaggerGPUStart bool `json:"stagger_gpu_start"`
ParallelGPUs bool `json:"parallel_gpus"`
Loader string `json:"loader"`
Profile string `json:"profile"`
DisplayName string `json:"display_name"`
PlatformComponents []string `json:"platform_components"`
@@ -549,14 +549,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName
}
params := taskParams{
Duration: body.Duration,
StressMode: body.StressMode,
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
StaggerGPUStart: body.StaggerGPUStart,
ParallelGPUs: body.ParallelGPUs,
Loader: body.Loader,
params := taskParams{
Duration: body.Duration,
StressMode: body.StressMode,
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
StaggerGPUStart: body.StaggerGPUStart,
ParallelGPUs: body.ParallelGPUs,
Loader: body.Loader,
BurnProfile: body.Profile,
DisplayName: body.DisplayName,
PlatformComponents: body.PlatformComponents,
@@ -573,131 +573,142 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
}
}
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
var body struct {
Profile string `json:"profile"`
SizeMB int `json:"size_mb"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
RunNCCL *bool `json:"run_nccl"`
ParallelGPUs *bool `json:"parallel_gpus"`
RampUp *bool `json:"ramp_up"`
DisplayName string `json:"display_name"`
}
if r.Body != nil {
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
writeError(w, http.StatusBadRequest, "invalid request body")
func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
}
runNCCL := true
if body.RunNCCL != nil {
runNCCL = *body.RunNCCL
}
parallelGPUs := false
if body.ParallelGPUs != nil {
parallelGPUs = *body.ParallelGPUs
}
rampUp := false
if body.RampUp != nil {
rampUp = *body.RampUp
}
// Build a descriptive base name that includes profile and mode so the task
// list is self-explanatory without opening individual task detail pages.
profile := strings.TrimSpace(body.Profile)
if profile == "" {
profile = "standard"
}
name := taskDisplayName("nvidia-benchmark", "", "")
if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName
}
// Append profile tag.
name = fmt.Sprintf("%s · %s", name, profile)
var body struct {
Profile string `json:"profile"`
SizeMB int `json:"size_mb"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
RunNCCL *bool `json:"run_nccl"`
ParallelGPUs *bool `json:"parallel_gpus"`
RampUp *bool `json:"ramp_up"`
DisplayName string `json:"display_name"`
}
if r.Body != nil {
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
writeError(w, http.StatusBadRequest, "invalid request body")
return
}
}
if rampUp && len(body.GPUIndices) > 1 {
// Ramp-up mode: resolve GPU list, then create one task per prefix
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
gpus, err := apiListNvidiaGPUs(h.opts.App)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
runNCCL := true
if body.RunNCCL != nil {
runNCCL = *body.RunNCCL
}
parallelGPUs := false
if body.ParallelGPUs != nil {
parallelGPUs = *body.ParallelGPUs
}
rampUp := false
if body.RampUp != nil {
rampUp = *body.RampUp
}
// Build a descriptive base name that includes profile and mode so the task
// list is self-explanatory without opening individual task detail pages.
profile := strings.TrimSpace(body.Profile)
if profile == "" {
profile = "standard"
}
name := taskDisplayName(target, "", "")
if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName
}
// Append profile tag.
name = fmt.Sprintf("%s · %s", name, profile)
if target == "nvidia-bench-power" && parallelGPUs {
writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
return
}
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
if len(resolved) < 2 {
// Fall through to normal single-task path.
rampUp = false
} else {
now := time.Now()
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
var allTasks []*Task
for step := 1; step <= len(resolved); step++ {
subset := resolved[:step]
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
t := &Task{
ID: newJobID("benchmark-nvidia"),
Name: stepName,
Target: "nvidia-benchmark",
Priority: defaultTaskPriority("nvidia-benchmark", taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), subset...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL && step == len(resolved),
ParallelGPUs: true,
RampStep: step,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
if rampUp && len(body.GPUIndices) > 1 {
// Ramp-up mode: resolve GPU list, then create one task per prefix
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
gpus, err := apiListNvidiaGPUs(h.opts.App)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
if len(resolved) < 2 {
// Fall through to normal single-task path.
rampUp = false
} else {
now := time.Now()
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
var allTasks []*Task
for step := 1; step <= len(resolved); step++ {
subset := resolved[:step]
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
t := &Task{
ID: newJobID("bee-bench-nvidia"),
Name: stepName,
Target: target,
Priority: defaultTaskPriority(target, taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), subset...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL && step == len(resolved),
ParallelGPUs: true,
RampStep: step,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
}
allTasks = append(allTasks, t)
for _, t := range allTasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, allTasks)
return
}
for _, t := range allTasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, allTasks)
}
// For non-ramp tasks append mode tag.
if parallelGPUs {
name = fmt.Sprintf("%s · parallel", name)
} else {
name = fmt.Sprintf("%s · sequential", name)
}
params := taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: parallelGPUs,
DisplayName: body.DisplayName,
}
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
for _, t := range tasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, tasks)
}
}
// For non-ramp tasks append mode tag.
if parallelGPUs {
name = fmt.Sprintf("%s · parallel", name)
} else {
name = fmt.Sprintf("%s · sequential", name)
}
params := taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: parallelGPUs,
DisplayName: body.DisplayName,
}
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", defaultTaskPriority("nvidia-benchmark", params), time.Now(), params, name, h.opts.App, "benchmark-nvidia")
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
for _, t := range tasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, tasks)
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
}
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
@@ -1072,18 +1083,55 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
status := h.opts.App.LiveBootSource()
status := h.currentRAMStatus()
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(status)
}
type ramStatusResponse struct {
platform.LiveMediaRAMState
InstallTaskActive bool `json:"install_task_active,omitempty"`
CopyTaskActive bool `json:"copy_task_active,omitempty"`
CanStartTask bool `json:"can_start_task,omitempty"`
BlockedReason string `json:"blocked_reason,omitempty"`
}
func (h *handler) currentRAMStatus() ramStatusResponse {
state := h.opts.App.LiveMediaRAMState()
resp := ramStatusResponse{LiveMediaRAMState: state}
if globalQueue.hasActiveTarget("install") {
resp.InstallTaskActive = true
resp.BlockedReason = "install to disk is already running"
return resp
}
if globalQueue.hasActiveTarget("install-to-ram") {
resp.CopyTaskActive = true
resp.BlockedReason = "install to RAM task is already pending or running"
return resp
}
if state.InRAM {
resp.BlockedReason = "system is already running from RAM"
return resp
}
resp.CanStartTask = state.CanStartCopy
if !resp.CanStartTask && resp.BlockedReason == "" {
resp.BlockedReason = state.Message
}
return resp
}
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
if globalQueue.hasActiveTarget("install") {
writeError(w, http.StatusConflict, "install to disk is already running")
status := h.currentRAMStatus()
if !status.CanStartTask {
msg := strings.TrimSpace(status.BlockedReason)
if msg == "" {
msg = "install to RAM is not available"
}
writeError(w, http.StatusConflict, msg)
return
}
t := &Task{

View File

@@ -64,7 +64,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -78,8 +78,8 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
}
task := globalQueue.tasks[0]
if task.Target != "nvidia-benchmark" {
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
if task.Target != "nvidia-bench-perf" {
t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
}
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
t.Fatalf("gpu indices=%v want [1 3]", got)
@@ -113,7 +113,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -147,6 +147,50 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
}
}
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks
globalQueue.tasks = nil
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = originalTasks
globalQueue.mu.Unlock()
})
prevList := apiListNvidiaGPUs
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
return []platform.NvidiaGPU{
{Index: 0, Name: "NVIDIA H100 PCIe"},
{Index: 1, Name: "NVIDIA H100 PCIe"},
{Index: 2, Name: "NVIDIA H100 PCIe"},
}, nil
}
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
if rec.Code != 200 {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
if len(globalQueue.tasks) != 3 {
t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
}
for i, task := range globalQueue.tasks {
if task.Target != "nvidia-bench-power" {
t.Fatalf("task[%d] target=%q", i, task.Target)
}
if task.Priority != taskPriorityBenchmark {
t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
}
}
}
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks
@@ -202,7 +246,8 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
defaultTaskPriority("cpu", taskParams{}),
defaultTaskPriority("cpu", taskParams{StressMode: true}),
defaultTaskPriority("nvidia-stress", taskParams{}),
defaultTaskPriority("nvidia-benchmark", taskParams{}),
defaultTaskPriority("nvidia-bench-perf", taskParams{}),
defaultTaskPriority("nvidia-bench-power", taskParams{}),
}
want := []int{
taskPriorityInstallToRAM,
@@ -211,13 +256,14 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
taskPriorityValidateStress,
taskPriorityBurn,
taskPriorityBenchmark,
taskPriorityBenchmark,
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
}
}
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5]) {
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
t.Fatalf("priority order=%v", got)
}
}

View File

@@ -232,7 +232,7 @@ func truncate(s string, max int) string {
// isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool {
switch target {
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
"platform-stress":

View File

@@ -845,6 +845,13 @@ func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow {
Source: "live-boot / /proc/mounts",
Issue: "",
}
case "partial":
return runtimeHealthRow{
Title: "LiveCD in RAM",
Status: "WARNING",
Source: "live-boot / /proc/mounts / /dev/shm/bee-live",
Issue: "Partial or staged RAM copy detected. System is not fully running from RAM; Copy to RAM can be retried.",
}
case "failed":
return runtimeHealthRow{
Title: "LiveCD in RAM",
@@ -1939,7 +1946,7 @@ func renderBenchmark(opts HandlerOptions) string {
<div class="grid2">
<div class="card">
<div class="card-head">NVIDIA Benchmark</div>
<div class="card-head">Benchmark Setup</div>
<div class="card-body">
<div class="form-row">
<label>Profile</label>
@@ -1972,21 +1979,25 @@ func renderBenchmark(opts HandlerOptions) string {
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
</label>
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
<button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>&#9654; Run Benchmark</button>
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>&#9654; Run Performance Benchmark</button>
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>&#9654; Run Power / Thermal Fit</button>
</div>
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
</div>
</div>
<div class="card">
<div class="card-head">Method</div>
<div class="card-head">Method Split</div>
<div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.</p>
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
<table>
<tr><th>Profile</th><th>Purpose</th></tr>
<tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr>
<tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr>
<tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
<tr><th>Run Type</th><th>Engine</th><th>Question</th></tr>
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td></tr>
<tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td></tr>
</table>
<p style="font-size:12px;color:var(--muted);margin-top:10px">Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
</div>
</div>
</div>
@@ -2029,21 +2040,24 @@ function benchmarkMode() {
function benchmarkUpdateSelectionNote() {
const selected = benchmarkSelectedGPUIndices();
const btn = document.getElementById('benchmark-run-btn');
const perfBtn = document.getElementById('benchmark-run-performance-btn');
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
const note = document.getElementById('benchmark-selection-note');
if (!selected.length) {
btn.disabled = true;
perfBtn.disabled = true;
fitBtn.disabled = true;
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
return;
}
btn.disabled = false;
perfBtn.disabled = false;
fitBtn.disabled = false;
const mode = benchmarkMode();
if (mode === 'ramp-up') {
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL on final step.';
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses targeted_power per step.';
} else if (mode === 'parallel') {
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously.' + (selected.length > 1 ? ' NCCL included.' : '');
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
} else {
note.textContent = 'Sequential: each GPU benchmarked separately.' + (selected.length > 1 ? ' NCCL included on each.' : '');
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
}
}
@@ -2117,7 +2131,7 @@ function benchmarkSelectNone() {
benchmarkUpdateSelectionNote();
}
function runNvidiaBenchmark() {
function runNvidiaBenchmark(kind) {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
if (!selected.length) {
@@ -2127,21 +2141,26 @@ function runNvidiaBenchmark() {
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
const mode = benchmarkMode();
const rampUp = mode === 'ramp-up' && selected.length > 1;
const parallelGPUs = mode === 'parallel';
const parallelGPUs = mode === 'parallel' && kind === 'performance';
if (kind === 'power-fit' && mode === 'parallel') {
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
return;
}
const body = {
profile: document.getElementById('benchmark-profile').value || 'standard',
gpu_indices: selected,
run_nccl: selected.length > 1,
run_nccl: kind === 'performance' && selected.length > 1,
parallel_gpus: parallelGPUs,
ramp_up: rampUp,
display_name: 'NVIDIA Benchmark'
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
};
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']';
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
const term = document.getElementById('benchmark-terminal');
term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n';
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
status.textContent = 'Queueing...';
fetch('/api/benchmark/nvidia/run', {
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
fetch(endpoint, {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify(body)
@@ -2195,7 +2214,7 @@ benchmarkLoadGPUs();
func renderBenchmarkResultsCard(exportDir string) string {
maxIdx, runs := loadBenchmarkHistory(exportDir)
return renderBenchmarkResultsCardFromRuns(
"Benchmark Results",
"Perf Results",
"Composite score by saved benchmark run and GPU.",
"No saved benchmark runs yet.",
maxIdx,
@@ -2237,11 +2256,11 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
}
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
baseDir := app.DefaultBenchmarkBaseDir
baseDir := app.DefaultBeeBenchPerfDir
if strings.TrimSpace(exportDir) != "" {
baseDir = filepath.Join(exportDir, "bee-benchmark")
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
}
paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json"))
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
if err != nil || len(paths) == 0 {
return -1, nil
}
@@ -2280,7 +2299,6 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
return maxGPUIndex, runs
}
// ── Burn ──────────────────────────────────────────────────────────────────────
func renderBurn() string {
@@ -3245,12 +3263,19 @@ fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
else if (kind === 'disk') label = 'disk (' + source + ')';
else label = source;
boot.textContent = 'Current boot source: ' + label + '.';
if (d.in_ram) {
txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.';
txt.textContent = d.message || 'Checking...';
if (d.status === 'ok' || d.in_ram) {
txt.style.color = 'var(--ok, green)';
} else if (d.status === 'failed') {
txt.style.color = 'var(--err, #b91c1c)';
} else {
txt.textContent = 'Live media is mounted from installation device. Copy to RAM to allow media removal.';
txt.style.color = 'var(--muted)';
}
if (d.can_start_task) {
btn.style.display = '';
btn.disabled = false;
} else {
btn.style.display = 'none';
}
});
function installToRAM() {

View File

@@ -261,7 +261,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
// Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)

View File

@@ -11,6 +11,7 @@ import (
"time"
"bee/audit/internal/platform"
"bee/audit/internal/schema"
)
func TestChartLegendNumber(t *testing.T) {
@@ -78,6 +79,16 @@ func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
}
}
func TestBuildRuntimeToRAMRowShowsPartialCopyWarning(t *testing.T) {
row := buildRuntimeToRAMRow(schema.RuntimeHealth{ToRAMStatus: "partial"})
if row.Status != "WARNING" {
t.Fatalf("status=%q want WARNING", row.Status)
}
if !strings.Contains(row.Issue, "Partial or staged RAM copy detected") {
t.Fatalf("issue=%q", row.Issue)
}
}
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
samples := []platform.LiveMetricSample{
{
@@ -637,8 +648,11 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
`href="/benchmark"`,
`id="benchmark-gpu-list"`,
`/api/gpu/nvidia`,
`/api/benchmark/nvidia/run`,
`/api/bee-bench/nvidia/perf/run`,
`/api/bee-bench/nvidia/power/run`,
`benchmark-run-nccl`,
`Run Performance Benchmark`,
`Run Power / Thermal Fit`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("benchmark page missing %q: %s", needle, body)
@@ -649,7 +663,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
dir := t.TempDir()
exportDir := filepath.Join(dir, "export")
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
if err := os.MkdirAll(runDir, 0755); err != nil {
t.Fatal(err)
}
@@ -691,7 +705,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
body := rec.Body.String()
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
for _, needle := range []string{
`Benchmark Results`,
`Perf Results`,
`Composite score by saved benchmark run and GPU.`,
`GPU 0`,
`GPU 1`,
@@ -1113,8 +1127,8 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
`>Storage<`,
`>GPU<`,
`>PSU<`,
`badge-warn`, // cpu Warning badge
`badge-err`, // storage Critical badge
`badge-warn`, // cpu Warning badge
`badge-err`, // storage Critical badge
} {
if !strings.Contains(body, needle) {
t.Fatalf("dashboard missing %q: %s", needle, body)

View File

@@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
b.WriteString(benchmarkCard)
}
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
b.WriteString(powerCard)
}
if len(report.Charts) > 0 {
for _, chart := range report.Charts {
@@ -251,7 +254,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
}
func renderTaskBenchmarkResultsCard(target, logText string) string {
if strings.TrimSpace(target) != "nvidia-benchmark" {
switch strings.TrimSpace(target) {
case "nvidia-bench-perf":
default:
return ""
}
resultPath := taskBenchmarkResultPath(logText)
@@ -263,7 +268,7 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
return ""
}
return renderBenchmarkResultsCardFromRuns(
"Benchmark Results",
"Perf Results",
"Composite score for this benchmark task.",
"No benchmark results were saved for this task.",
columns,
@@ -271,15 +276,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
)
}
func renderTaskPowerResultsCard(target, logText string) string {
if strings.TrimSpace(target) != "nvidia-bench-power" {
return ""
}
resultPath := taskBenchmarkResultPath(logText)
if strings.TrimSpace(resultPath) == "" {
return ""
}
raw, err := os.ReadFile(resultPath)
if err != nil {
return ""
}
var result platform.NvidiaPowerBenchResult
if err := json.Unmarshal(raw, &result); err != nil {
return ""
}
var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
if len(result.RecommendedSlotOrder) > 0 {
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
}
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
}
b.WriteString(`</table></div></div>`)
return b.String()
}
func taskBenchmarkResultPath(logText string) string {
archivePath := taskArchivePathFromLog(logText)
if archivePath == "" {
return ""
}
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
if runDir == archivePath {
return ""
}
return filepath.Join(runDir, "result.json")
}

View File

@@ -32,7 +32,8 @@ const (
var taskNames = map[string]string{
"nvidia": "NVIDIA SAT",
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-benchmark": "NVIDIA Benchmark",
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
"nvidia-bench-power": "NVIDIA Bee Bench Power",
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -628,7 +629,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
dur = 300
}
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-benchmark":
case "nvidia-bench-perf":
if a == nil {
err = fmt.Errorf("app not configured")
break
@@ -644,6 +645,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-bench-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RampStep: t.params.RampStep,
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")

View File

@@ -366,7 +366,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
taskReportMetricsDBPath = metricsPath
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
t.Fatal(err)
}
@@ -398,14 +398,14 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
}
task := &Task{
ID: "task-bench",
Name: "NVIDIA Benchmark",
Target: "nvidia-benchmark",
Name: "NVIDIA Bee Bench Perf",
Target: "nvidia-bench-perf",
Status: TaskDone,
CreatedAt: time.Now().UTC().Add(-time.Minute),
ArtifactsDir: artifactsDir,
}
ensureTaskReportPaths(task)
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
t.Fatal(err)
}
@@ -420,7 +420,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
}
html := string(body)
for _, needle := range []string{
`Benchmark Results`,
`Perf Results`,
`Composite score for this benchmark task.`,
`GPU 0`,
`1176.25`,

View File

@@ -1,5 +1,34 @@
# Benchmark clock calibration research
## Benchmark methodology versioning
Every benchmark methodology change must bump the benchmark version constant in
source code by exactly `+1`.
Methodology change means any change that affects comparability of benchmark
results, including for example:
- phase durations or phase order
- enabled/disabled precisions
- fallback rules
- normalization rules
- score formulas or weights
- degradation thresholds
- power calibration logic
- thermal/power penalty logic
Requirements:
- benchmark version must be stored in source code as an explicit version
constant, not inferred from git tag or build metadata
- benchmark report must always print the benchmark version
- `result.json` must always include the benchmark version
- results from different benchmark versions must be treated as non-comparable by
default
Purpose:
- prevent accidental comparison of runs produced by different methodologies
- make historical benchmark archives self-describing even when detached from git
- force deliberate version bumps whenever scoring or execution semantics change
## Status
In progress. Baseline data from production servers pending.

View File

@@ -642,6 +642,20 @@ static const struct profile_desc k_profiles[] = {
CUDA_R_16F,
CUBLAS_COMPUTE_32F_FAST_16F,
},
{
"int8_tensor",
"int8",
75,
1,
0,
0,
128,
CUDA_R_8I,
CUDA_R_8I,
CUDA_R_32I,
CUDA_R_32I,
CUBLAS_COMPUTE_32I,
},
{
"fp8_e4m3",
"fp8",
@@ -760,10 +774,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
switch (type) {
case CUDA_R_32F:
case CUDA_R_32I:
return (size_t)(elements * 4u);
case CUDA_R_16F:
case CUDA_R_16BF:
return (size_t)(elements * 2u);
case CUDA_R_8I:
case CUDA_R_8F_E4M3:
case CUDA_R_8F_E5M2:
return (size_t)(elements);
@@ -776,6 +792,16 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
}
}
static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
if (desc->compute_type == CUBLAS_COMPUTE_32I) {
return CUDA_R_32I;
}
if (desc->compute_type == CUBLAS_COMPUTE_64F) {
return CUDA_R_64F;
}
return CUDA_R_32F;
}
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
uint64_t row_tiles = (rows + 127u) / 128u;
uint64_t col_tiles = (cols + 63u) / 64u;
@@ -944,8 +970,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
return 0;
}
cudaDataType_t scale_type = matmul_scale_type(desc);
if (!check_cublas("cublasLtMatmulDescCreate",
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
destroy_profile(cublas, cuda, out);
return 0;
}
@@ -1094,17 +1121,30 @@ static int prepare_profile(struct cublaslt_api *cublas,
static int run_cublas_profile(cublasLtHandle_t handle,
struct cublaslt_api *cublas,
struct prepared_profile *profile) {
int32_t alpha_i32 = 1;
int32_t beta_i32 = 0;
double alpha_f64 = 1.0;
double beta_f64 = 0.0;
float alpha = 1.0f;
float beta = 0.0f;
const void *alpha_ptr = &alpha;
const void *beta_ptr = &beta;
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
alpha_ptr = &alpha_i32;
beta_ptr = &beta_i32;
} else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
alpha_ptr = &alpha_f64;
beta_ptr = &beta_f64;
}
return check_cublas(profile->desc.name,
cublas->cublasLtMatmul(handle,
profile->op_desc,
&alpha,
alpha_ptr,
(const void *)(uintptr_t)profile->a_dev,
profile->a_layout,
(const void *)(uintptr_t)profile->b_dev,
profile->b_layout,
&beta,
beta_ptr,
(const void *)(uintptr_t)profile->c_dev,
profile->c_layout,
(void *)(uintptr_t)profile->d_dev,
@@ -1359,11 +1399,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
}
#endif
static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
printf("device=%s\n", report->device);
printf("device_index=%d\n", device_index);
printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
printf("backend=%s\n", report->backend);
printf("duration_s=%d\n", seconds);
printf("buffer_mb=%d\n", report->buffer_mb);
printf("streams=%d\n", report->stream_count);
printf("iterations=%lu\n", report->iterations);
printf("checksum=%llu\n", (unsigned long long)report->checksum);
if (report->details[0] != '\0') {
printf("%s", report->details);
}
printf("status=OK\n");
}
int main(int argc, char **argv) {
int seconds = 5;
int size_mb = 64;
int device_index = 0;
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
const char *precision_plan = NULL;
const char *precision_plan_seconds = NULL;
for (int i = 1; i < argc; i++) {
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
seconds = atoi(argv[++i]);
@@ -1373,9 +1431,13 @@ int main(int argc, char **argv) {
device_index = atoi(argv[++i]);
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
precision_filter = argv[++i];
} else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
precision_plan = argv[++i];
} else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
precision_plan_seconds = argv[++i];
} else {
fprintf(stderr,
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n",
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
argv[0]);
return 2;
}
@@ -1436,6 +1498,76 @@ int main(int argc, char **argv) {
int ok = 0;
#if HAVE_CUBLASLT_HEADERS
if (precision_plan != NULL && precision_plan[0] != '\0') {
char *plan_copy = strdup(precision_plan);
char *plan_seconds_copy = NULL;
int phase_seconds[32] = {0};
int phase_seconds_count = 0;
int phase_ok = 0;
if (plan_copy == NULL) {
fprintf(stderr, "failed to allocate precision plan buffer\n");
return 1;
}
if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
plan_seconds_copy = strdup(precision_plan_seconds);
if (plan_seconds_copy == NULL) {
free(plan_copy);
fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
return 1;
}
for (char *sec_token = strtok(plan_seconds_copy, ",");
sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
sec_token = strtok(NULL, ",")) {
while (*sec_token == ' ' || *sec_token == '\t') {
sec_token++;
}
if (*sec_token == '\0') {
continue;
}
phase_seconds[phase_seconds_count++] = atoi(sec_token);
}
}
int phase_idx = 0;
for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
while (*token == ' ' || *token == '\t') {
token++;
}
if (*token == '\0') {
continue;
}
const char *phase_name = token;
const char *phase_filter = token;
if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
phase_filter = NULL;
}
int phase_duration = seconds;
if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
phase_duration = phase_seconds[phase_idx];
}
printf("phase_begin=%s\n", phase_name);
fflush(stdout);
memset(&report, 0, sizeof(report));
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
if (ok) {
print_stress_report(&report, device_index, phase_duration);
phase_ok = 1;
} else {
printf("phase_error=%s\n", phase_name);
if (report.details[0] != '\0') {
printf("%s", report.details);
if (report.details[strlen(report.details) - 1] != '\n') {
printf("\n");
}
}
printf("status=FAILED\n");
}
printf("phase_end=%s\n", phase_name);
fflush(stdout);
}
free(plan_seconds_copy);
free(plan_copy);
return phase_ok ? 0 : 1;
}
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
#endif
if (!ok) {
@@ -1454,18 +1586,6 @@ int main(int argc, char **argv) {
}
}
printf("device=%s\n", report.device);
printf("device_index=%d\n", device_index);
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
printf("backend=%s\n", report.backend);
printf("duration_s=%d\n", seconds);
printf("buffer_mb=%d\n", report.buffer_mb);
printf("streams=%d\n", report.stream_count);
printf("iterations=%lu\n", report.iterations);
printf("checksum=%llu\n", (unsigned long long)report.checksum);
if (report.details[0] != '\0') {
printf("%s", report.details);
}
printf("status=OK\n");
print_stress_report(&report, device_index, seconds);
return 0;
}

View File

@@ -7,10 +7,12 @@ SIZE_MB=0
DEVICES=""
EXCLUDE=""
PRECISION=""
PRECISION_PLAN=""
PRECISION_PLAN_SECONDS=""
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
usage() {
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision fp8|fp16|fp32|fp64|fp4]" >&2
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2
exit 2
}
@@ -32,6 +34,8 @@ while [ "$#" -gt 0 ]; do
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
--precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
--precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;;
--precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;;
*) usage ;;
esac
done
@@ -92,8 +96,12 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
precision_arg=""
[ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
precision_plan_arg=""
[ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}"
precision_plan_seconds_arg=""
[ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}"
CUDA_VISIBLE_DEVICES="${id}" \
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} >"${log}" 2>&1 &
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 &
pid=$!
WORKERS="${WORKERS} ${pid}:${id}:${log}"
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then