From b4990a85b3a6569006a3c818998cdb58307b4a9b Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Thu, 16 Apr 2026 13:04:34 +0300 Subject: [PATCH] one more pass add --- perf/perf-20260416-080820/result.json | 1206 +++++++++++++++++++++++ power/power-20260416-073931/result.json | 124 +++ power/power-20260416-081628/result.json | 132 +++ 3 files changed, 1462 insertions(+) create mode 100644 perf/perf-20260416-080820/result.json create mode 100644 power/power-20260416-073931/result.json create mode 100644 power/power-20260416-081628/result.json diff --git a/perf/perf-20260416-080820/result.json b/perf/perf-20260416-080820/result.json new file mode 100644 index 0000000..f3518cd --- /dev/null +++ b/perf/perf-20260416-080820/result.json @@ -0,0 +1,1206 @@ +{ + "benchmark_version": "2", + "generated_at": "2026-04-16T08:08:20.63072288Z", + "hostname": "debian", + "server_model": "MLT-S06", + "benchmark_profile": "standard", + "parallel_gpus": true, + "ramp_total": 4, + "ramp_run_id": "ramp-20260416-072956", + "overall_status": "FAILED", + "selected_gpu_indices": [ + 0, + 1, + 2, + 3 + ], + "findings": [ + "0 of 4 GPU(s) passed the benchmark.", + "[POWER] GPU 0: power cap throttle 100.0% of steady state — server is not delivering full TDP to the GPU.", + "[THERMAL] GPU 0: thermal throttle 100.0% of steady state.", + "GPU 0 average SM clock stayed below the requested lock target.", + "GPU 0 had incomplete precision coverage: fp64:FAILED, fp4:FAILED.", + "[POWER] GPU 1: power cap throttle 100.0% of steady state — server is not delivering full TDP to the GPU.", + "[THERMAL] GPU 1: thermal throttle 100.0% of steady state.", + "GPU 1 average SM clock stayed below the requested lock target.", + "GPU 1 had incomplete precision coverage: fp64:FAILED, fp4:FAILED.", + "[POWER] GPU 2: power cap throttle 100.0% of steady state — server is not delivering full TDP to the GPU.", + "[THERMAL] GPU 2: thermal throttle 100.0% of steady state.", + "GPU 2 average SM clock stayed below the requested lock target.", + "GPU 2 had incomplete precision coverage: fp64:FAILED, fp4:FAILED.", + "[POWER] GPU 3: power cap throttle 100.0% of steady state — server is not delivering full TDP to the GPU.", + "[THERMAL] GPU 3: thermal throttle 100.0% of steady state.", + "GPU 3 average SM clock stayed below the requested lock target.", + "GPU 3 had incomplete precision coverage: fp64:FAILED, fp4:FAILED.", + "Multi-GPU all_reduce max bus bandwidth: 8.5 GB/s." + ], + "normalization": { + "status": "full", + "gpus": [ + { + "index": 0, + "persistence_mode": "applied", + "gpu_clock_lock_mhz": 1980, + "gpu_clock_lock_status": "applied", + "memory_clock_lock_mhz": 2619, + "memory_clock_lock_status": "applied" + }, + { + "index": 1, + "persistence_mode": "applied", + "gpu_clock_lock_mhz": 1980, + "gpu_clock_lock_status": "applied", + "memory_clock_lock_mhz": 2619, + "memory_clock_lock_status": "applied" + }, + { + "index": 2, + "persistence_mode": "applied", + "gpu_clock_lock_mhz": 1980, + "gpu_clock_lock_status": "applied", + "memory_clock_lock_mhz": 2619, + "memory_clock_lock_status": "applied" + }, + { + "index": 3, + "persistence_mode": "applied", + "gpu_clock_lock_mhz": 1980, + "gpu_clock_lock_status": "applied", + "memory_clock_lock_mhz": 2619, + "memory_clock_lock_status": "applied" + } + ] + }, + "host_config": { + "cpu_model": "Intel(R) Xeon(R) Gold 6430", + "cpu_sockets": 2, + "cpu_cores": 64, + "cpu_threads": 128, + "mem_total_gib": 62.53376007080078 + }, + "cpu_load": { + "avg_pct": 2.3, + "max_pct": 3.5, + "p95_pct": 3.2, + "samples": 45, + "status": "ok" + }, + "cooling": { + "available": true, + "avg_fan_rpm": 6523.529411764706, + "notes": [ + "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected" + ] + }, + "gpus": [ + { + "index": 0, + "uuid": "GPU-a87b1588-4e92-bd7b-74be-9db000808a64", + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:49:00.0", + "vbios": "96.00.74.00.01", + "status": "PARTIAL", + "power_limit_w": 700, + "default_power_limit_w": 700, + "max_graphics_clock_mhz": 1980, + "max_memory_clock_mhz": 2619, + "locked_graphics_clock_mhz": 1980, + "locked_memory_clock_mhz": 2619, + "baseline": { + "duration_sec": 18.67612165, + "samples": 1, + "avg_temp_c": 41, + "p95_temp_c": 41, + "avg_power_w": 138.02, + "p95_power_w": 138.02, + "avg_graphics_clock_mhz": 1830, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "steady": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "precision_steady": [ + { + "precision": "int8", + "status": "OK", + "steady": { + "duration_sec": 53.666271418, + "samples": 2, + "avg_temp_c": 51.5, + "p95_temp_c": 59.15, + "avg_power_w": 448.345, + "p95_power_w": 674.3395, + "avg_graphics_clock_mhz": 1522.5, + "p95_graphics_clock_mhz": 1799.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 50.5, + "avg_mem_usage_pct": 43.5, + "clock_cv_pct": 20.19704433497537, + "power_cv_pct": 56.00709275223321, + "temp_cv_pct": 16.50485436893204, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1151.669844836352, + "weighted_teraops_per_sec": 287.917461209088, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp8", + "status": "OK", + "steady": { + "duration_sec": 94.749526187, + "samples": 2, + "avg_temp_c": 68, + "p95_temp_c": 69.8, + "avg_power_w": 699.245, + "p95_power_w": 699.3575, + "avg_graphics_clock_mhz": 1162.5, + "p95_graphics_clock_mhz": 1196.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 75.5, + "clock_cv_pct": 3.225806451612903, + "power_cv_pct": 0.017876423857160224, + "temp_cv_pct": 2.941176470588235, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1418.4876727074816, + "weighted_teraops_per_sec": 354.6219181768704, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp16", + "status": "OK", + "steady": { + "duration_sec": 160.278286598, + "samples": 3, + "avg_temp_c": 75.66666666666667, + "p95_temp_c": 77.8, + "avg_power_w": 699.0766666666667, + "p95_power_w": 699.643, + "avg_graphics_clock_mhz": 1140, + "p95_graphics_clock_mhz": 1140, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 66.33333333333333, + "clock_cv_pct": 0, + "power_cv_pct": 0.07192548770429025, + "temp_cv_pct": 2.715600882365188, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 605.7576061293909, + "weighted_teraops_per_sec": 302.87880306469543, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp32", + "status": "OK", + "steady": { + "duration_sec": 234.413351493, + "samples": 3, + "avg_temp_c": 79, + "p95_temp_c": 79.9, + "avg_power_w": 698.8633333333333, + "p95_power_w": 698.938, + "avg_graphics_clock_mhz": 1205, + "p95_graphics_clock_mhz": 1239, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 100, + "clock_cv_pct": 2.3472424271752614, + "power_cv_pct": 0.008846382454843477, + "temp_cv_pct": 1.0335399758578812, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 329.0028834816, + "weighted_teraops_per_sec": 329.0028834816, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp64", + "status": "FAILED", + "steady": { + "duration_sec": 281.842154315, + "samples": 2, + "avg_temp_c": 81.5, + "p95_temp_c": 81.95, + "avg_power_w": 698.085, + "p95_power_w": 698.2245, + "avg_graphics_clock_mhz": 1170, + "p95_graphics_clock_mhz": 1197, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 89.5, + "clock_cv_pct": 2.564102564102564, + "power_cv_pct": 0.022203599848160258, + "temp_cv_pct": 0.6134969325153374, + "clock_drift_pct": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "notes": "precision phase failed" + }, + { + "precision": "fp4", + "status": "FAILED", + "steady": { + "duration_sec": 323.470255195, + "samples": 2, + "avg_temp_c": 80.5, + "p95_temp_c": 82.75, + "avg_power_w": 481.03499999999997, + "p95_power_w": 677.4465, + "avg_graphics_clock_mhz": 1507.5, + "p95_graphics_clock_mhz": 1797.75, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 50, + "avg_mem_usage_pct": 37, + "clock_cv_pct": 21.393034825870647, + "power_cv_pct": 45.36780067978422, + "temp_cv_pct": 3.1055900621118013, + "clock_drift_pct": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "notes": "precision phase failed" + } + ], + "precision_failures": [ + "fp64:FAILED", + "fp4:FAILED" + ], + "cooldown": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "throttle_counters": { + "sw_power_cap_us": 200018, + "sw_thermal_slowdown_us": 200019, + "sync_boost_us": 0, + "hw_thermal_slowdown_us": 0, + "hw_power_brake_slowdown_us": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "scores": { + "compute_score": 1274.421065932254, + "synthetic_score": 1274.421065932254, + "power_sustain_score": 66.60624071973443, + "thermal_sustain_score": 100, + "stability_score": 0, + "thermal_throttle_pct": 100, + "power_cap_throttle_pct": 100, + "temp_headroom_c": 0, + "interconnect_score": 8.54, + "server_quality_score": 49.98187221592033, + "composite_score": 1274.421065932254 + }, + "degradation_reasons": [ + "power_capped", + "thermal_limited", + "low_sm_clock_vs_target" + ], + "notes": [ + "parallel warmup failed: exit status 1" + ] + }, + { + "index": 1, + "uuid": "GPU-30b320aa-c18f-6b25-d0ed-24aeb14f1fd3", + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:5A:00.0", + "vbios": "96.00.74.00.01", + "status": "PARTIAL", + "power_limit_w": 700, + "default_power_limit_w": 700, + "max_graphics_clock_mhz": 1980, + "max_memory_clock_mhz": 2619, + "locked_graphics_clock_mhz": 1980, + "locked_memory_clock_mhz": 2619, + "baseline": { + "duration_sec": 18.67612165, + "samples": 1, + "avg_temp_c": 44, + "p95_temp_c": 44, + "avg_power_w": 139.47, + "p95_power_w": 139.47, + "avg_graphics_clock_mhz": 1830, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "steady": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "precision_steady": [ + { + "precision": "int8", + "status": "OK", + "steady": { + "duration_sec": 53.666271418, + "samples": 2, + "avg_temp_c": 54, + "p95_temp_c": 61.2, + "avg_power_w": 449.475, + "p95_power_w": 673.8045, + "avg_graphics_clock_mhz": 1507.5, + "p95_graphics_clock_mhz": 1797.75, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 50, + "avg_mem_usage_pct": 42.5, + "clock_cv_pct": 21.393034825870647, + "power_cv_pct": 55.454697146671116, + "temp_cv_pct": 14.814814814814813, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1149.1842264662016, + "weighted_teraops_per_sec": 287.2960566165504, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp8", + "status": "OK", + "steady": { + "duration_sec": 94.749526187, + "samples": 2, + "avg_temp_c": 70, + "p95_temp_c": 71.8, + "avg_power_w": 699.095, + "p95_power_w": 699.3605, + "avg_graphics_clock_mhz": 1162.5, + "p95_graphics_clock_mhz": 1169.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 75.5, + "clock_cv_pct": 0.6451612903225806, + "power_cv_pct": 0.04219741236885057, + "temp_cv_pct": 2.857142857142857, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1417.5994524991488, + "weighted_teraops_per_sec": 354.3998631247872, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp16", + "status": "OK", + "steady": { + "duration_sec": 160.278286598, + "samples": 3, + "avg_temp_c": 77, + "p95_temp_c": 78.8, + "avg_power_w": 698.4633333333333, + "p95_power_w": 698.548, + "avg_graphics_clock_mhz": 1105, + "p95_graphics_clock_mhz": 1110, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 62, + "clock_cv_pct": 0.6399156390828484, + "power_cv_pct": 0.010213341935880632, + "temp_cv_pct": 2.1207703400720157, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 605.7576061293909, + "weighted_teraops_per_sec": 302.87880306469543, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp32", + "status": "OK", + "steady": { + "duration_sec": 234.413351493, + "samples": 3, + "avg_temp_c": 80, + "p95_temp_c": 80.9, + "avg_power_w": 699.0533333333333, + "p95_power_w": 699.09, + "avg_graphics_clock_mhz": 1140, + "p95_graphics_clock_mhz": 1140, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 100, + "clock_cv_pct": 0, + "power_cv_pct": 0.007417817041191886, + "temp_cv_pct": 1.0206207261596576, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 328.3913539584, + "weighted_teraops_per_sec": 328.3913539584, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp64", + "status": "FAILED", + "steady": { + "duration_sec": 281.842154315, + "samples": 2, + "avg_temp_c": 81.5, + "p95_temp_c": 81.95, + "avg_power_w": 680.315, + "p95_power_w": 697.4375, + "avg_graphics_clock_mhz": 1117.5, + "p95_graphics_clock_mhz": 1151.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 88.5, + "clock_cv_pct": 3.3557046979865772, + "power_cv_pct": 2.796498680758183, + "temp_cv_pct": 0.6134969325153374, + "clock_drift_pct": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "notes": "precision phase failed" + }, + { + "precision": "fp4", + "status": "FAILED", + "steady": { + "duration_sec": 323.470255195, + "samples": 2, + "avg_temp_c": 83.5, + "p95_temp_c": 83.95, + "avg_power_w": 699.105, + "p95_power_w": 699.5145, + "avg_graphics_clock_mhz": 1147.5, + "p95_graphics_clock_mhz": 1208.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 61, + "clock_cv_pct": 5.88235294117647, + "power_cv_pct": 0.06508321353730613, + "temp_cv_pct": 0.5988023952095809, + "clock_drift_pct": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "notes": "precision phase failed" + } + ], + "precision_failures": [ + "fp64:FAILED", + "fp4:FAILED" + ], + "cooldown": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "throttle_counters": { + "sw_power_cap_us": 100036, + "sw_thermal_slowdown_us": 100036, + "sync_boost_us": 0, + "hw_thermal_slowdown_us": 0, + "hw_power_brake_slowdown_us": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "scores": { + "compute_score": 1272.966076764433, + "synthetic_score": 1272.966076764433, + "power_sustain_score": 81.87262810051264, + "thermal_sustain_score": 100, + "stability_score": 0, + "thermal_throttle_pct": 100, + "power_cap_throttle_pct": 100, + "temp_headroom_c": 0, + "interconnect_score": 8.54, + "server_quality_score": 54.56178843015379, + "composite_score": 1272.966076764433 + }, + "degradation_reasons": [ + "power_capped", + "thermal_limited", + "low_sm_clock_vs_target" + ], + "notes": [ + "parallel warmup failed: exit status 1" + ] + }, + { + "index": 2, + "uuid": "GPU-0182c11c-0c2c-aafd-0ada-113b64147ee6", + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:C8:00.0", + "vbios": "96.00.74.00.01", + "status": "PARTIAL", + "power_limit_w": 700, + "default_power_limit_w": 700, + "max_graphics_clock_mhz": 1980, + "max_memory_clock_mhz": 2619, + "locked_graphics_clock_mhz": 1980, + "locked_memory_clock_mhz": 2619, + "baseline": { + "duration_sec": 18.67612165, + "samples": 1, + "avg_temp_c": 43, + "p95_temp_c": 43, + "avg_power_w": 122.08, + "p95_power_w": 122.08, + "avg_graphics_clock_mhz": 1830, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "steady": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "precision_steady": [ + { + "precision": "int8", + "status": "OK", + "steady": { + "duration_sec": 53.666271418, + "samples": 2, + "avg_temp_c": 53, + "p95_temp_c": 60.2, + "avg_power_w": 447.94, + "p95_power_w": 672.9129999999999, + "avg_graphics_clock_mhz": 1522.5, + "p95_graphics_clock_mhz": 1799.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 50, + "avg_mem_usage_pct": 43, + "clock_cv_pct": 20.19704433497537, + "power_cv_pct": 55.804348796713846, + "temp_cv_pct": 15.09433962264151, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1149.59849619456, + "weighted_teraops_per_sec": 287.39962404864, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp8", + "status": "OK", + "steady": { + "duration_sec": 94.749526187, + "samples": 2, + "avg_temp_c": 69, + "p95_temp_c": 70.8, + "avg_power_w": 699.225, + "p95_power_w": 699.3285000000001, + "avg_graphics_clock_mhz": 1140, + "p95_graphics_clock_mhz": 1180.5, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 73.5, + "clock_cv_pct": 3.9473684210526314, + "power_cv_pct": 0.016446780363975698, + "temp_cv_pct": 2.898550724637681, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1417.5994524991488, + "weighted_teraops_per_sec": 354.3998631247872, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp16", + "status": "OK", + "steady": { + "duration_sec": 160.278286598, + "samples": 3, + "avg_temp_c": 75.33333333333333, + "p95_temp_c": 76.9, + "avg_power_w": 699.0933333333334, + "p95_power_w": 699.518, + "avg_graphics_clock_mhz": 1115, + "p95_graphics_clock_mhz": 1123.5, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 64, + "clock_cv_pct": 0.6341764853691009, + "power_cv_pct": 0.050873430990042545, + "temp_cv_pct": 2.2562033245985775, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 605.7576061293909, + "weighted_teraops_per_sec": 302.87880306469543, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp32", + "status": "OK", + "steady": { + "duration_sec": 234.413351493, + "samples": 3, + "avg_temp_c": 78.66666666666667, + "p95_temp_c": 79.9, + "avg_power_w": 698.96, + "p95_power_w": 699.181, + "avg_graphics_clock_mhz": 1150, + "p95_graphics_clock_mhz": 1167, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 100, + "clock_cv_pct": 1.2297509238026914, + "power_cv_pct": 0.027170690803947822, + "temp_cv_pct": 1.5854480452431954, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 328.7990403072, + "weighted_teraops_per_sec": 328.7990403072, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp64", + "status": "FAILED", + "steady": { + "duration_sec": 281.842154315, + "samples": 2, + "avg_temp_c": 81, + "p95_temp_c": 81.9, + "avg_power_w": 697.7, + "p95_power_w": 698.861, + "avg_graphics_clock_mhz": 1170, + "p95_graphics_clock_mhz": 1210.5, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 85, + "clock_cv_pct": 3.8461538461538463, + "power_cv_pct": 0.18489322058191493, + "temp_cv_pct": 1.2345679012345678, + "clock_drift_pct": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "notes": "precision phase failed" + }, + { + "precision": "fp4", + "status": "FAILED", + "steady": { + "duration_sec": 323.470255195, + "samples": 2, + "avg_temp_c": 82.5, + "p95_temp_c": 82.95, + "avg_power_w": 697.71, + "p95_power_w": 698.268, + "avg_graphics_clock_mhz": 1192.5, + "p95_graphics_clock_mhz": 1212.75, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 64.5, + "clock_cv_pct": 1.8867924528301887, + "power_cv_pct": 0.08886213469779773, + "temp_cv_pct": 0.6060606060606061, + "clock_drift_pct": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "notes": "precision phase failed" + } + ], + "precision_failures": [ + "fp64:FAILED", + "fp4:FAILED" + ], + "cooldown": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "throttle_counters": { + "sw_power_cap_us": 99976, + "sw_thermal_slowdown_us": 99976, + "sync_boost_us": 0, + "hw_thermal_slowdown_us": 0, + "hw_power_brake_slowdown_us": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "scores": { + "compute_score": 1273.4773305453227, + "synthetic_score": 1273.4773305453227, + "power_sustain_score": 83.14921020461449, + "thermal_sustain_score": 100, + "stability_score": 0, + "thermal_throttle_pct": 100, + "power_cap_throttle_pct": 100, + "temp_headroom_c": 0, + "interconnect_score": 8.54, + "server_quality_score": 54.944763061384336, + "composite_score": 1273.4773305453227 + }, + "degradation_reasons": [ + "power_capped", + "thermal_limited", + "low_sm_clock_vs_target" + ], + "notes": [ + "parallel warmup failed: exit status 1" + ] + }, + { + "index": 3, + "uuid": "GPU-9ee0af22-3dda-6f5c-1a13-35c63f324216", + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:D8:00.0", + "vbios": "96.00.74.00.01", + "status": "PARTIAL", + "power_limit_w": 700, + "default_power_limit_w": 700, + "max_graphics_clock_mhz": 1980, + "max_memory_clock_mhz": 2619, + "locked_graphics_clock_mhz": 1980, + "locked_memory_clock_mhz": 2619, + "baseline": { + "duration_sec": 18.67612165, + "samples": 1, + "avg_temp_c": 41, + "p95_temp_c": 41, + "avg_power_w": 98.14, + "p95_power_w": 98.14, + "avg_graphics_clock_mhz": 1830, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "steady": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "precision_steady": [ + { + "precision": "int8", + "status": "OK", + "steady": { + "duration_sec": 53.666271418, + "samples": 2, + "avg_temp_c": 51, + "p95_temp_c": 58.2, + "avg_power_w": 445.47, + "p95_power_w": 672.252, + "avg_graphics_clock_mhz": 1522.5, + "p95_graphics_clock_mhz": 1799.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 50, + "avg_mem_usage_pct": 42, + "clock_cv_pct": 20.19704433497537, + "power_cv_pct": 56.56497631714819, + "temp_cv_pct": 15.686274509803921, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1149.59849619456, + "weighted_teraops_per_sec": 287.39962404864, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp8", + "status": "OK", + "steady": { + "duration_sec": 94.749526187, + "samples": 2, + "avg_temp_c": 67, + "p95_temp_c": 68.8, + "avg_power_w": 698.51, + "p95_power_w": 699.3470000000001, + "avg_graphics_clock_mhz": 1162.5, + "p95_graphics_clock_mhz": 1196.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 73.5, + "clock_cv_pct": 3.225806451612903, + "power_cv_pct": 0.13314054201085265, + "temp_cv_pct": 2.9850746268656714, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1417.5994524991488, + "weighted_teraops_per_sec": 354.3998631247872, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp16", + "status": "OK", + "steady": { + "duration_sec": 160.278286598, + "samples": 3, + "avg_temp_c": 73.66666666666667, + "p95_temp_c": 74.9, + "avg_power_w": 699.1333333333332, + "p95_power_w": 699.4029999999999, + "avg_graphics_clock_mhz": 1140, + "p95_graphics_clock_mhz": 1153.5, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 62, + "clock_cv_pct": 1.07433760648385, + "power_cv_pct": 0.03630428114961699, + "temp_cv_pct": 1.6930576410741816, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 605.7576061293909, + "weighted_teraops_per_sec": 302.87880306469543, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp32", + "status": "OK", + "steady": { + "duration_sec": 234.413351493, + "samples": 3, + "avg_temp_c": 76.33333333333333, + "p95_temp_c": 77, + "avg_power_w": 699.0499999999998, + "p95_power_w": 699.372, + "avg_graphics_clock_mhz": 1185, + "p95_graphics_clock_mhz": 1198.5, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 100, + "clock_cv_pct": 1.0335399758578812, + "power_cv_pct": 0.04093033637817591, + "temp_cv_pct": 1.2351210151730088, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 328.7990403072, + "weighted_teraops_per_sec": 328.7990403072, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp64", + "status": "FAILED", + "steady": { + "duration_sec": 281.842154315, + "samples": 2, + "avg_temp_c": 78.5, + "p95_temp_c": 78.95, + "avg_power_w": 697.9100000000001, + "p95_power_w": 698.63, + "avg_graphics_clock_mhz": 1155, + "p95_graphics_clock_mhz": 1168.5, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 86.5, + "clock_cv_pct": 1.2987012987012987, + "power_cv_pct": 0.11462796062529715, + "temp_cv_pct": 0.6369426751592357, + "clock_drift_pct": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "notes": "precision phase failed" + }, + { + "precision": "fp4", + "status": "FAILED", + "steady": { + "duration_sec": 323.470255195, + "samples": 2, + "avg_temp_c": 80, + "p95_temp_c": 80, + "avg_power_w": 696.935, + "p95_power_w": 697.1645000000001, + "avg_graphics_clock_mhz": 1185, + "p95_graphics_clock_mhz": 1198.5, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 70.5, + "clock_cv_pct": 1.2658227848101267, + "power_cv_pct": 0.036588778006564786, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "notes": "precision phase failed" + } + ], + "precision_failures": [ + "fp64:FAILED", + "fp4:FAILED" + ], + "cooldown": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "throttle_counters": { + "sw_power_cap_us": 199978, + "sw_thermal_slowdown_us": 199978, + "sync_boost_us": 0, + "hw_thermal_slowdown_us": 0, + "hw_power_brake_slowdown_us": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "scores": { + "compute_score": 1273.4773305453227, + "synthetic_score": 1273.4773305453227, + "power_sustain_score": 83.15253738424808, + "thermal_sustain_score": 100, + "stability_score": 0, + "thermal_throttle_pct": 100, + "power_cap_throttle_pct": 100, + "temp_headroom_c": 0, + "interconnect_score": 8.54, + "server_quality_score": 54.94576121527442, + "composite_score": 1273.4773305453227 + }, + "degradation_reasons": [ + "power_capped", + "thermal_limited", + "low_sm_clock_vs_target" + ], + "notes": [ + "parallel warmup failed: exit status 1" + ] + } + ], + "interconnect": { + "status": "OK", + "attempted": true, + "supported": true, + "selected_gpu_indices": [ + 0, + 1, + 2, + 3 + ], + "avg_algbw_gbps": 5.6375, + "max_algbw_gbps": 5.69, + "avg_busbw_gbps": 8.4575, + "max_busbw_gbps": 8.54 + }, + "server_power": { + "available": false, + "notes": [ + "IPMI power reading unavailable; server-side power characterization skipped" + ] + } +} \ No newline at end of file diff --git a/power/power-20260416-073931/result.json b/power/power-20260416-073931/result.json new file mode 100644 index 0000000..c126aa1 --- /dev/null +++ b/power/power-20260416-073931/result.json @@ -0,0 +1,124 @@ +{ + "benchmark_version": "2", + "generated_at": "2026-04-16T07:39:31.70304016Z", + "hostname": "debian", + "server_model": "MLT-S06", + "benchmark_profile": "standard", + "selected_gpu_indices": [ + 0, + 1, + 2, + 3 + ], + "recommended_slot_order": [ + 3, + 2, + 1, + 0 + ], + "ramp_steps": [ + { + "step_index": 1, + "gpu_indices": [ + 3 + ], + "new_gpu_index": 3, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 699.746, + "avg_observed_power_w": 699.746, + "status": "OK" + }, + { + "step_index": 2, + "gpu_indices": [ + 3, + 2 + ], + "new_gpu_index": 2, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 698.9375, + "avg_observed_power_w": 349.46875, + "status": "OK" + }, + { + "step_index": 3, + "gpu_indices": [ + 3, + 2, + 1 + ], + "new_gpu_index": 1, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 698.9350000000001, + "avg_observed_power_w": 232.97833333333335, + "status": "OK" + }, + { + "step_index": 4, + "gpu_indices": [ + 3, + 2, + 1, + 0 + ], + "new_gpu_index": 0, + "new_gpu_stable_limit_w": 575, + "total_observed_power_w": 574.178, + "avg_observed_power_w": 143.5445, + "derated": true, + "status": "PARTIAL" + } + ], + "overall_status": "PARTIAL", + "platform_max_tdp_w": 2675, + "findings": [ + "Recommended slot order for installation based on single-card targeted_power: 3,2,1,0.", + "Ramp step 4 (GPU 0) required derating to 575 W under combined thermal load." + ], + "gpus": [ + { + "index": 3, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:D8:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 700, + "max_observed_power_w": 699.746, + "max_observed_temp_c": 73.8, + "calibration_attempts": 1, + "status": "OK" + }, + { + "index": 2, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:C8:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 700, + "max_observed_power_w": 699.2909999999999, + "max_observed_temp_c": 78.1, + "calibration_attempts": 1, + "status": "OK" + }, + { + "index": 1, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:5A:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 700, + "max_observed_power_w": 698.86, + "max_observed_temp_c": 78.6, + "calibration_attempts": 1, + "status": "OK" + }, + { + "index": 0, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:49:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 575, + "max_observed_power_w": 697.712, + "max_observed_temp_c": 76.8, + "calibration_attempts": 1, + "status": "OK" + } + ] +} \ No newline at end of file diff --git a/power/power-20260416-081628/result.json b/power/power-20260416-081628/result.json new file mode 100644 index 0000000..72266db --- /dev/null +++ b/power/power-20260416-081628/result.json @@ -0,0 +1,132 @@ +{ + "benchmark_version": "2", + "generated_at": "2026-04-16T08:16:28.673449017Z", + "hostname": "debian", + "server_model": "MLT-S06", + "benchmark_profile": "stability", + "selected_gpu_indices": [ + 0, + 1, + 2, + 3 + ], + "recommended_slot_order": [ + 1, + 2, + 3, + 0 + ], + "ramp_steps": [ + { + "step_index": 1, + "gpu_indices": [ + 1 + ], + "new_gpu_index": 1, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 699.1899999999999, + "avg_observed_power_w": 699.1899999999999, + "status": "OK" + }, + { + "step_index": 2, + "gpu_indices": [ + 1, + 2 + ], + "new_gpu_index": 2, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 698.405, + "avg_observed_power_w": 349.2025, + "status": "OK" + }, + { + "step_index": 3, + "gpu_indices": [ + 1, + 2, + 3 + ], + "new_gpu_index": 3, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 699.54, + "avg_observed_power_w": 233.17999999999998, + "status": "OK" + }, + { + "step_index": 4, + "gpu_indices": [ + 1, + 2, + 3, + 0 + ], + "new_gpu_index": 0, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 669.28, + "avg_observed_power_w": 167.32, + "status": "OK" + } + ], + "overall_status": "PARTIAL", + "platform_max_tdp_w": 2800, + "findings": [ + "Recommended slot order for installation based on single-card targeted_power: 1,2,3,0.", + "GPU 0 required reduced power limit 670 W to complete targeted_power." + ], + "gpus": [ + { + "index": 1, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:5A:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 700, + "max_observed_power_w": 699.1899999999999, + "max_observed_temp_c": 78.75, + "calibration_attempts": 1, + "status": "OK" + }, + { + "index": 2, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:C8:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 700, + "max_observed_power_w": 699.088, + "max_observed_temp_c": 77.4, + "calibration_attempts": 1, + "status": "OK" + }, + { + "index": 3, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:D8:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 700, + "max_observed_power_w": 698.5699999999999, + "max_observed_temp_c": 74.95, + "calibration_attempts": 1, + "status": "OK" + }, + { + "index": 0, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:49:00.0", + "applied_power_limit_w": 670, + "stable_power_limit_w": 700, + "max_observed_power_w": 668.956, + "max_observed_temp_c": 82.19999999999999, + "calibration_attempts": 5, + "derated": true, + "status": "PARTIAL", + "notes": [ + "targeted_power attempt 1: sw_thermal throttle at 700 W", + "binary search: trying 625 W (lo=550 hi=700)", + "binary search: stable at 625 W, trying 660 W (lo=625 hi=700)", + "binary search: stable at 660 W, trying 680 W (lo=660 hi=700)", + "targeted_power attempt 4: sw_thermal throttle at 680 W", + "binary search: trying 670 W (lo=660 hi=680)" + ] + } + ] +} \ No newline at end of file