diff --git a/perf/perf-20260416-142113/result.json b/perf/perf-20260416-142113/result.json new file mode 100644 index 0000000..2323278 --- /dev/null +++ b/perf/perf-20260416-142113/result.json @@ -0,0 +1,1238 @@ +{ + "benchmark_version": "2", + "generated_at": "2026-04-16T14:21:13.433467372Z", + "hostname": "debian", + "server_model": "MLT-S06", + "benchmark_profile": "standard", + "parallel_gpus": true, + "ramp_total": 4, + "ramp_run_id": "ramp-20260416-133118", + "overall_status": "OK", + "selected_gpu_indices": [ + 0, + 1, + 2, + 3 + ], + "findings": [ + "All 4 GPU(s) passed the benchmark.", + "GPU 0 average SM clock stayed below the requested lock target.", + "[HARD STOP] GPU 0: p95 temperature 85.0°C — only 5.0°C from shutdown threshold (90°C). Do not operate.", + "GPU 1 average SM clock stayed below the requested lock target.", + "[HARD STOP] GPU 1: p95 temperature 85.0°C — only 5.0°C from shutdown threshold (90°C). Do not operate.", + "GPU 2 average SM clock stayed below the requested lock target.", + "[HARD STOP] GPU 2: p95 temperature 83.0°C — only 7.0°C from shutdown threshold (90°C). Do not operate.", + "GPU 3 average SM clock stayed below the requested lock target.", + "[HARD STOP] GPU 3: p95 temperature 82.0°C — only 8.0°C from shutdown threshold (90°C). Do not operate.", + "Multi-GPU all_reduce max bus bandwidth: 8.6 GB/s." + ], + "normalization": { + "status": "full", + "gpus": [ + { + "index": 0, + "persistence_mode": "applied", + "gpu_clock_lock_mhz": 1980, + "gpu_clock_lock_status": "applied", + "memory_clock_lock_mhz": 2619, + "memory_clock_lock_status": "applied" + }, + { + "index": 1, + "persistence_mode": "applied", + "gpu_clock_lock_mhz": 1980, + "gpu_clock_lock_status": "applied", + "memory_clock_lock_mhz": 2619, + "memory_clock_lock_status": "applied" + }, + { + "index": 2, + "persistence_mode": "applied", + "gpu_clock_lock_mhz": 1980, + "gpu_clock_lock_status": "applied", + "memory_clock_lock_mhz": 2619, + "memory_clock_lock_status": "applied" + }, + { + "index": 3, + "persistence_mode": "applied", + "gpu_clock_lock_mhz": 1980, + "gpu_clock_lock_status": "applied", + "memory_clock_lock_mhz": 2619, + "memory_clock_lock_status": "applied" + } + ] + }, + "host_config": { + "cpu_model": "Intel(R) Xeon(R) Gold 6430", + "cpu_sockets": 2, + "cpu_cores": 64, + "cpu_threads": 128, + "mem_total_gib": 62.53376007080078 + }, + "cpu_load": { + "avg_pct": 2.5, + "max_pct": 3.4, + "p95_pct": 3.3, + "samples": 88, + "status": "ok" + }, + "cooling": { + "available": true, + "avg_fan_rpm": 6445.138888888889, + "fan_duty_cycle_available": true, + "fan_duty_cycle_estimated": true, + "avg_fan_duty_cycle_pct": 87.26521164021146, + "p95_fan_duty_cycle_pct": 95.65252976190474, + "notes": [ + "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading" + ] + }, + "gpus": [ + { + "index": 0, + "uuid": "GPU-a87b1588-4e92-bd7b-74be-9db000808a64", + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:49:00.0", + "vbios": "96.00.74.00.01", + "compute_capability": "9.0", + "backend": "cublasLt", + "status": "OK", + "power_limit_w": 700, + "default_power_limit_w": 700, + "max_graphics_clock_mhz": 1980, + "max_memory_clock_mhz": 2619, + "locked_graphics_clock_mhz": 1980, + "locked_memory_clock_mhz": 2619, + "baseline": { + "duration_sec": 20.487515832, + "samples": 1, + "avg_temp_c": 69, + "p95_temp_c": 69, + "avg_power_w": 181.66, + "p95_power_w": 181.66, + "avg_graphics_clock_mhz": 1830, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "steady": { + "duration_sec": 715.865398029, + "samples": 21, + "avg_temp_c": 81.23809523809524, + "p95_temp_c": 85, + "avg_power_w": 629.2452380952379, + "p95_power_w": 697.96, + "avg_graphics_clock_mhz": 1300.7142857142858, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 85.71428571428571, + "avg_mem_usage_pct": 50.61904761904762, + "clock_cv_pct": 17.027510138414932, + "power_cv_pct": 25.700013778114617, + "temp_cv_pct": 8.408169321004037, + "clock_drift_pct": 0 + }, + "precision_steady": [ + { + "precision": "int8", + "status": "OK", + "steady": { + "duration_sec": 51.349862924, + "samples": 2, + "avg_temp_c": 65.5, + "p95_temp_c": 70.45, + "avg_power_w": 458.21500000000003, + "p95_power_w": 675.1015, + "avg_graphics_clock_mhz": 1507.5, + "p95_graphics_clock_mhz": 1797.75, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 50.5, + "avg_mem_usage_pct": 42.5, + "clock_cv_pct": 21.393034825870647, + "power_cv_pct": 52.59212378468623, + "temp_cv_pct": 8.396946564885496, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1149.1842264662016, + "weighted_teraops_per_sec": 287.2960566165504, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp8", + "status": "OK", + "steady": { + "duration_sec": 112.097913814, + "samples": 3, + "avg_temp_c": 76.66666666666667, + "p95_temp_c": 78.8, + "avg_power_w": 698.3166666666667, + "p95_power_w": 699.505, + "avg_graphics_clock_mhz": 1155, + "p95_graphics_clock_mhz": 1182, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 71.33333333333333, + "clock_cv_pct": 2.1207703400720157, + "power_cv_pct": 0.14121311398533568, + "temp_cv_pct": 2.6801800012908594, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1417.5994524991488, + "weighted_teraops_per_sec": 354.3998631247872, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp16", + "status": "OK", + "steady": { + "duration_sec": 170.809433267, + "samples": 3, + "avg_temp_c": 80.33333333333333, + "p95_temp_c": 80.9, + "avg_power_w": 698.3166666666666, + "p95_power_w": 698.579, + "avg_graphics_clock_mhz": 1150, + "p95_graphics_clock_mhz": 1167, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 76, + "clock_cv_pct": 1.2297509238026914, + "power_cv_pct": 0.033365917310299316, + "temp_cv_pct": 0.5868106067938154, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 605.7576061293909, + "weighted_teraops_per_sec": 302.87880306469543, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp32", + "status": "OK", + "steady": { + "duration_sec": 234.458143018, + "samples": 2, + "avg_temp_c": 81, + "p95_temp_c": 81, + "avg_power_w": 699.075, + "p95_power_w": 699.3315, + "avg_graphics_clock_mhz": 1185, + "p95_graphics_clock_mhz": 1185, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 100, + "clock_cv_pct": 0, + "power_cv_pct": 0.04076815792297321, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 329.0028834816, + "weighted_teraops_per_sec": 329.0028834816, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + } + ], + "cooldown": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "throttle_counters": { + "sw_power_cap_us": 100016, + "sw_thermal_slowdown_us": 100016, + "sync_boost_us": 0, + "hw_thermal_slowdown_us": 0, + "hw_power_brake_slowdown_us": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "precision_results": [ + { + "name": "fp16_tensor", + "category": "fp16_bf16", + "supported": true, + "lanes": 16, + "m": 10880, + "n": 10880, + "k": 10880, + "iterations": 25760, + "teraops_per_sec": 221.17767359146666, + "weight": 0.5, + "weighted_teraops_per_sec": 110.58883679573333 + }, + { + "name": "fp32_tf32", + "category": "fp32_tf32", + "supported": true, + "lanes": 16, + "m": 7680, + "n": 7680, + "k": 7680, + "iterations": 25760, + "teraops_per_sec": 77.7925951488, + "weight": 1, + "weighted_teraops_per_sec": 77.7925951488 + }, + { + "name": "fp64", + "category": "fp64", + "supported": false, + "weight": 2, + "notes": "benchmark_disabled" + }, + { + "name": "fp8_e4m3", + "category": "fp8", + "supported": true, + "lanes": 16, + "m": 12544, + "n": 12544, + "k": 12544, + "iterations": 25760, + "teraops_per_sec": 338.9711491355989, + "weight": 0.25, + "weighted_teraops_per_sec": 84.74278728389973 + }, + { + "name": "fp8_e5m2", + "category": "fp8", + "supported": false, + "weight": 0.25, + "notes": "unsupported" + }, + { + "name": "int8_tensor", + "category": "int8", + "supported": true, + "lanes": 16, + "m": 9728, + "n": 9728, + "k": 9728, + "iterations": 25760, + "teraops_per_sec": 158.09760300018348, + "weight": 0.25, + "weighted_teraops_per_sec": 39.52440075004587 + } + ], + "scores": { + "compute_score": 1367.3721922811767, + "synthetic_score": 1273.577606287633, + "mixed_score": 312.6486199784789, + "mixed_efficiency": 0.2454884715583389, + "power_sustain_score": 74.83848960808605, + "thermal_sustain_score": 74.77549203698788, + "stability_score": 99.97205731684326, + "thermal_throttle_pct": 0.013971341578371458, + "power_cap_throttle_pct": 0.013971341578371458, + "temp_headroom_c": 5, + "interconnect_score": 8.55, + "server_quality_score": 84.87301742025949, + "composite_score": 1367.3721922811767 + }, + "degradation_reasons": [ + "low_sm_clock_vs_target" + ] + }, + { + "index": 1, + "uuid": "GPU-30b320aa-c18f-6b25-d0ed-24aeb14f1fd3", + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:5A:00.0", + "vbios": "96.00.74.00.01", + "compute_capability": "9.0", + "backend": "cublasLt", + "status": "OK", + "power_limit_w": 700, + "default_power_limit_w": 700, + "max_graphics_clock_mhz": 1980, + "max_memory_clock_mhz": 2619, + "locked_graphics_clock_mhz": 1980, + "locked_memory_clock_mhz": 2619, + "baseline": { + "duration_sec": 20.487515832, + "samples": 1, + "avg_temp_c": 71, + "p95_temp_c": 71, + "avg_power_w": 173.85, + "p95_power_w": 173.85, + "avg_graphics_clock_mhz": 1830, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "steady": { + "duration_sec": 715.865398029, + "samples": 21, + "avg_temp_c": 82.61904761904762, + "p95_temp_c": 85, + "avg_power_w": 634.0671428571428, + "p95_power_w": 698.06, + "avg_graphics_clock_mhz": 1260, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 85.71428571428571, + "avg_mem_usage_pct": 52.714285714285715, + "clock_cv_pct": 18.78719361094068, + "power_cv_pct": 23.244958976113367, + "temp_cv_pct": 5.633692511025891, + "clock_drift_pct": 0 + }, + "precision_steady": [ + { + "precision": "int8", + "status": "OK", + "steady": { + "duration_sec": 51.349862924, + "samples": 2, + "avg_temp_c": 68.5, + "p95_temp_c": 72.55, + "avg_power_w": 472.435, + "p95_power_w": 675.3625, + "avg_graphics_clock_mhz": 1492.5, + "p95_graphics_clock_mhz": 1796.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 50, + "avg_mem_usage_pct": 41.5, + "clock_cv_pct": 22.613065326633166, + "power_cv_pct": 47.72614222062294, + "temp_cv_pct": 6.569343065693431, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1146.6986080960512, + "weighted_teraops_per_sec": 286.6746520240128, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp8", + "status": "OK", + "steady": { + "duration_sec": 112.097913814, + "samples": 3, + "avg_temp_c": 78.66666666666667, + "p95_temp_c": 79.9, + "avg_power_w": 697.9699999999999, + "p95_power_w": 698.582, + "avg_graphics_clock_mhz": 1085, + "p95_graphics_clock_mhz": 1134, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 67, + "clock_cv_pct": 3.9642051921855423, + "power_cv_pct": 0.08413735466916311, + "temp_cv_pct": 1.5854480452431954, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1414.9347918741505, + "weighted_teraops_per_sec": 353.7336979685376, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp16", + "status": "OK", + "steady": { + "duration_sec": 170.809433267, + "samples": 3, + "avg_temp_c": 82, + "p95_temp_c": 82, + "avg_power_w": 698.6133333333333, + "p95_power_w": 698.958, + "avg_graphics_clock_mhz": 1095, + "p95_graphics_clock_mhz": 1095, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 74.66666666666667, + "clock_cv_pct": 0, + "power_cv_pct": 0.04337475447501179, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 605.7576061293909, + "weighted_teraops_per_sec": 302.87880306469543, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp32", + "status": "OK", + "steady": { + "duration_sec": 234.458143018, + "samples": 2, + "avg_temp_c": 81, + "p95_temp_c": 81, + "avg_power_w": 649.905, + "p95_power_w": 662.7615000000001, + "avg_graphics_clock_mhz": 1072.5, + "p95_graphics_clock_mhz": 1079.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 99, + "clock_cv_pct": 0.6993006993006993, + "power_cv_pct": 2.1980135558273943, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 328.3913539584, + "weighted_teraops_per_sec": 328.3913539584, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + } + ], + "cooldown": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "throttle_counters": { + "sw_power_cap_us": 99988, + "sw_thermal_slowdown_us": 99988, + "sync_boost_us": 0, + "hw_thermal_slowdown_us": 0, + "hw_power_brake_slowdown_us": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "precision_results": [ + { + "name": "fp16_tensor", + "category": "fp16_bf16", + "supported": true, + "lanes": 16, + "m": 10880, + "n": 10880, + "k": 10880, + "iterations": 24624, + "teraops_per_sec": 211.42387556352, + "weight": 0.5, + "weighted_teraops_per_sec": 105.71193778176 + }, + { + "name": "fp32_tf32", + "category": "fp32_tf32", + "supported": true, + "lanes": 16, + "m": 7680, + "n": 7680, + "k": 7680, + "iterations": 24624, + "teraops_per_sec": 74.36199002112, + "weight": 1, + "weighted_teraops_per_sec": 74.36199002112 + }, + { + "name": "fp64", + "category": "fp64", + "supported": false, + "weight": 2, + "notes": "benchmark_disabled" + }, + { + "name": "fp8_e4m3", + "category": "fp8", + "supported": true, + "lanes": 16, + "m": 12544, + "n": 12544, + "k": 12544, + "iterations": 24624, + "teraops_per_sec": 324.02273199980544, + "weight": 0.25, + "weighted_teraops_per_sec": 81.00568299995136 + }, + { + "name": "fp8_e5m2", + "category": "fp8", + "supported": false, + "weight": 0.25, + "notes": "unsupported" + }, + { + "name": "int8_tensor", + "category": "int8", + "supported": true, + "lanes": 16, + "m": 9728, + "n": 9728, + "k": 9728, + "iterations": 24624, + "teraops_per_sec": 151.1255969051443, + "weight": 0.25, + "weighted_teraops_per_sec": 37.78139922628608 + } + ], + "scores": { + "compute_score": 1361.336810024381, + "synthetic_score": 1271.678507015646, + "mixed_score": 298.86101002911744, + "mixed_efficiency": 0.23501302285156925, + "power_sustain_score": 73.25585575127133, + "thermal_sustain_score": 83.09892246692233, + "stability_score": 99.9720651395429, + "thermal_throttle_pct": 0.013967430228545486, + "power_cap_throttle_pct": 0.013967430228545486, + "temp_headroom_c": 5, + "interconnect_score": 8.55, + "server_quality_score": 86.89525952127526, + "composite_score": 1361.336810024381 + }, + "degradation_reasons": [ + "low_sm_clock_vs_target" + ] + }, + { + "index": 2, + "uuid": "GPU-0182c11c-0c2c-aafd-0ada-113b64147ee6", + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:C8:00.0", + "vbios": "96.00.74.00.01", + "compute_capability": "9.0", + "backend": "cublasLt", + "status": "OK", + "power_limit_w": 700, + "default_power_limit_w": 700, + "max_graphics_clock_mhz": 1980, + "max_memory_clock_mhz": 2619, + "locked_graphics_clock_mhz": 1980, + "locked_memory_clock_mhz": 2619, + "baseline": { + "duration_sec": 20.487515832, + "samples": 1, + "avg_temp_c": 66, + "p95_temp_c": 66, + "avg_power_w": 145.93, + "p95_power_w": 145.93, + "avg_graphics_clock_mhz": 1830, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "steady": { + "duration_sec": 715.865398029, + "samples": 21, + "avg_temp_c": 80.61904761904762, + "p95_temp_c": 83, + "avg_power_w": 636.3133333333332, + "p95_power_w": 701.39, + "avg_graphics_clock_mhz": 1260, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 85.71428571428571, + "avg_mem_usage_pct": 50.19047619047619, + "clock_cv_pct": 18.653808797627907, + "power_cv_pct": 23.841801321120794, + "temp_cv_pct": 6.969375376219228, + "clock_drift_pct": 0 + }, + "precision_steady": [ + { + "precision": "int8", + "status": "OK", + "steady": { + "duration_sec": 51.349862924, + "samples": 2, + "avg_temp_c": 66.5, + "p95_temp_c": 70.55, + "avg_power_w": 471.405, + "p95_power_w": 676.0065, + "avg_graphics_clock_mhz": 1500, + "p95_graphics_clock_mhz": 1797, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 50, + "avg_mem_usage_pct": 41.5, + "clock_cv_pct": 22, + "power_cv_pct": 48.224987006926106, + "temp_cv_pct": 6.7669172932330826, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1147.527147552768, + "weighted_teraops_per_sec": 286.881786888192, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp8", + "status": "OK", + "steady": { + "duration_sec": 112.097913814, + "samples": 3, + "avg_temp_c": 76, + "p95_temp_c": 77.8, + "avg_power_w": 699.94, + "p95_power_w": 701.489, + "avg_graphics_clock_mhz": 1140, + "p95_graphics_clock_mhz": 1153.5, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 68.33333333333333, + "clock_cv_pct": 1.07433760648385, + "power_cv_pct": 0.18365617738503892, + "temp_cv_pct": 2.1486752129677, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1414.9347918741505, + "weighted_teraops_per_sec": 353.7336979685376, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp16", + "status": "OK", + "steady": { + "duration_sec": 170.809433267, + "samples": 3, + "avg_temp_c": 79.66666666666667, + "p95_temp_c": 80, + "avg_power_w": 698.0733333333333, + "p95_power_w": 698.9, + "avg_graphics_clock_mhz": 1140, + "p95_graphics_clock_mhz": 1191, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 75, + "clock_cv_pct": 3.7216146378239348, + "power_cv_pct": 0.14423331175591858, + "temp_cv_pct": 0.5917211558046422, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 605.7576061293909, + "weighted_teraops_per_sec": 302.87880306469543, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp32", + "status": "OK", + "steady": { + "duration_sec": 234.458143018, + "samples": 2, + "avg_temp_c": 80, + "p95_temp_c": 80, + "avg_power_w": 699.12, + "p95_power_w": 699.237, + "avg_graphics_clock_mhz": 1155, + "p95_graphics_clock_mhz": 1155, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 100, + "clock_cv_pct": 0, + "power_cv_pct": 0.018594804897584888, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 327.9836676096, + "weighted_teraops_per_sec": 327.9836676096, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + } + ], + "cooldown": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "throttle_counters": { + "sw_power_cap_us": 200011, + "sw_thermal_slowdown_us": 200012, + "sync_boost_us": 0, + "hw_thermal_slowdown_us": 0, + "hw_power_brake_slowdown_us": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "precision_results": [ + { + "name": "fp16_tensor", + "category": "fp16_bf16", + "supported": true, + "lanes": 16, + "m": 10880, + "n": 10880, + "k": 10880, + "iterations": 24640, + "teraops_per_sec": 211.56125300053336, + "weight": 0.5, + "weighted_teraops_per_sec": 105.78062650026668 + }, + { + "name": "fp32_tf32", + "category": "fp32_tf32", + "supported": true, + "lanes": 16, + "m": 7680, + "n": 7680, + "k": 7680, + "iterations": 24640, + "teraops_per_sec": 74.4103084032, + "weight": 1, + "weighted_teraops_per_sec": 74.4103084032 + }, + { + "name": "fp64", + "category": "fp64", + "supported": false, + "weight": 2, + "notes": "benchmark_disabled" + }, + { + "name": "fp8_e4m3", + "category": "fp8", + "supported": true, + "lanes": 16, + "m": 12544, + "n": 12544, + "k": 12544, + "iterations": 24640, + "teraops_per_sec": 324.23327308622504, + "weight": 0.25, + "weighted_teraops_per_sec": 81.05831827155626 + }, + { + "name": "fp8_e5m2", + "category": "fp8", + "supported": false, + "weight": 0.25, + "notes": "unsupported" + }, + { + "name": "int8_tensor", + "category": "int8", + "supported": true, + "lanes": 16, + "m": 9728, + "n": 9728, + "k": 9728, + "iterations": 24640, + "teraops_per_sec": 151.22379417408854, + "weight": 0.25, + "weighted_teraops_per_sec": 37.805948543522135 + } + ], + "scores": { + "compute_score": 1361.1945160465887, + "synthetic_score": 1271.477955531025, + "mixed_score": 299.05520171854505, + "mixed_efficiency": 0.2352028207942044, + "power_sustain_score": 74.7401367794711, + "thermal_sustain_score": 79.09187387134232, + "stability_score": 99.94412036102018, + "thermal_throttle_pct": 0.027939889335438648, + "power_cap_throttle_pct": 0.02793974964437344, + "temp_headroom_c": 7, + "interconnect_score": 8.55, + "server_quality_score": 86.1272513396521, + "composite_score": 1361.1945160465887 + }, + "degradation_reasons": [ + "low_sm_clock_vs_target" + ] + }, + { + "index": 3, + "uuid": "GPU-9ee0af22-3dda-6f5c-1a13-35c63f324216", + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:D8:00.0", + "vbios": "96.00.74.00.01", + "compute_capability": "9.0", + "backend": "cublasLt", + "status": "OK", + "power_limit_w": 700, + "default_power_limit_w": 700, + "max_graphics_clock_mhz": 1980, + "max_memory_clock_mhz": 2619, + "locked_graphics_clock_mhz": 1980, + "locked_memory_clock_mhz": 2619, + "baseline": { + "duration_sec": 20.487515832, + "samples": 1, + "avg_temp_c": 64, + "p95_temp_c": 64, + "avg_power_w": 112.32, + "p95_power_w": 112.32, + "avg_graphics_clock_mhz": 1980, + "p95_graphics_clock_mhz": 1980, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "steady": { + "duration_sec": 715.865398029, + "samples": 21, + "avg_temp_c": 78.95238095238095, + "p95_temp_c": 82, + "avg_power_w": 615.5780952380953, + "p95_power_w": 699.76, + "avg_graphics_clock_mhz": 1307.142857142857, + "p95_graphics_clock_mhz": 1830, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 80.95238095238095, + "avg_mem_usage_pct": 47.476190476190474, + "clock_cv_pct": 19.6037061362318, + "power_cv_pct": 28.346418909802996, + "temp_cv_pct": 8.351205756656684, + "clock_drift_pct": 0 + }, + "precision_steady": [ + { + "precision": "int8", + "status": "OK", + "steady": { + "duration_sec": 51.349862924, + "samples": 2, + "avg_temp_c": 64.5, + "p95_temp_c": 68.55, + "avg_power_w": 466.385, + "p95_power_w": 675.7744999999999, + "avg_graphics_clock_mhz": 1507.5, + "p95_graphics_clock_mhz": 1797.75, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 50, + "avg_mem_usage_pct": 41.5, + "clock_cv_pct": 21.393034825870647, + "power_cv_pct": 49.88475186809181, + "temp_cv_pct": 6.976744186046512, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1147.1128778244095, + "weighted_teraops_per_sec": 286.77821945610236, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp8", + "status": "OK", + "steady": { + "duration_sec": 112.097913814, + "samples": 3, + "avg_temp_c": 75, + "p95_temp_c": 76.8, + "avg_power_w": 698.8933333333334, + "p95_power_w": 699.885, + "avg_graphics_clock_mhz": 1140, + "p95_graphics_clock_mhz": 1167, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 66.66666666666667, + "clock_cv_pct": 2.1486752129677, + "power_cv_pct": 0.11879844120756695, + "temp_cv_pct": 2.1773242158072694, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 1415.8230120824833, + "weighted_teraops_per_sec": 353.9557530206208, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp16", + "status": "OK", + "steady": { + "duration_sec": 170.809433267, + "samples": 3, + "avg_temp_c": 78.66666666666667, + "p95_temp_c": 79, + "avg_power_w": 698.1666666666666, + "p95_power_w": 698.7339999999999, + "avg_graphics_clock_mhz": 1145, + "p95_graphics_clock_mhz": 1167, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 74.66666666666667, + "clock_cv_pct": 1.6339115226087082, + "power_cv_pct": 0.0725584540138954, + "temp_cv_pct": 0.5992430349038538, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 605.7576061293909, + "weighted_teraops_per_sec": 302.87880306469543, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + }, + { + "precision": "fp32", + "status": "OK", + "steady": { + "duration_sec": 234.458143018, + "samples": 2, + "avg_temp_c": 79, + "p95_temp_c": 79, + "avg_power_w": 698.85, + "p95_power_w": 698.967, + "avg_graphics_clock_mhz": 1162.5, + "p95_graphics_clock_mhz": 1169.25, + "avg_memory_clock_mhz": 2619, + "p95_memory_clock_mhz": 2619, + "avg_usage_pct": 100, + "avg_mem_usage_pct": 100, + "clock_cv_pct": 0.6451612903225806, + "power_cv_pct": 0.01860198898189818, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "teraops_per_sec": 328.3913539584, + "weighted_teraops_per_sec": 328.3913539584, + "ecc": { + "corrected": 0, + "uncorrected": 0 + } + } + ], + "cooldown": { + "duration_sec": 0, + "samples": 0, + "avg_temp_c": 0, + "p95_temp_c": 0, + "avg_power_w": 0, + "p95_power_w": 0, + "avg_graphics_clock_mhz": 0, + "p95_graphics_clock_mhz": 0, + "avg_memory_clock_mhz": 0, + "p95_memory_clock_mhz": 0, + "avg_usage_pct": 0, + "avg_mem_usage_pct": 0, + "clock_cv_pct": 0, + "power_cv_pct": 0, + "temp_cv_pct": 0, + "clock_drift_pct": 0 + }, + "throttle_counters": { + "sw_power_cap_us": 200033, + "sw_thermal_slowdown_us": 200033, + "sync_boost_us": 0, + "hw_thermal_slowdown_us": 0, + "hw_power_brake_slowdown_us": 0 + }, + "ecc": { + "corrected": 0, + "uncorrected": 0 + }, + "precision_results": [ + { + "name": "fp16_tensor", + "category": "fp16_bf16", + "supported": true, + "lanes": 16, + "m": 10880, + "n": 10880, + "k": 10880, + "iterations": 24640, + "teraops_per_sec": 211.56125300053336, + "weight": 0.5, + "weighted_teraops_per_sec": 105.78062650026668 + }, + { + "name": "fp32_tf32", + "category": "fp32_tf32", + "supported": true, + "lanes": 16, + "m": 7680, + "n": 7680, + "k": 7680, + "iterations": 24640, + "teraops_per_sec": 74.4103084032, + "weight": 1, + "weighted_teraops_per_sec": 74.4103084032 + }, + { + "name": "fp64", + "category": "fp64", + "supported": false, + "weight": 2, + "notes": "benchmark_disabled" + }, + { + "name": "fp8_e4m3", + "category": "fp8", + "supported": true, + "lanes": 16, + "m": 12544, + "n": 12544, + "k": 12544, + "iterations": 24640, + "teraops_per_sec": 324.23327308622504, + "weight": 0.25, + "weighted_teraops_per_sec": 81.05831827155626 + }, + { + "name": "fp8_e5m2", + "category": "fp8", + "supported": false, + "weight": 0.25, + "notes": "unsupported" + }, + { + "name": "int8_tensor", + "category": "int8", + "supported": true, + "lanes": 16, + "m": 9728, + "n": 9728, + "k": 9728, + "iterations": 24640, + "teraops_per_sec": 151.22379417408854, + "weight": 0.25, + "weighted_teraops_per_sec": 37.805948543522135 + } + ], + "scores": { + "compute_score": 1361.7206900153822, + "synthetic_score": 1272.0041294998186, + "mixed_score": 299.05520171854505, + "mixed_efficiency": 0.23510552739804425, + "power_sustain_score": 74.84253083684747, + "thermal_sustain_score": 74.94638273002994, + "stability_score": 99.94411435430439, + "thermal_throttle_pct": 0.027942822847808132, + "power_cap_throttle_pct": 0.027942822847808132, + "temp_headroom_c": 8, + "interconnect_score": 8.55, + "server_quality_score": 84.91431981178498, + "composite_score": 1361.7206900153822 + }, + "degradation_reasons": [ + "low_sm_clock_vs_target" + ] + } + ], + "interconnect": { + "status": "OK", + "attempted": true, + "supported": true, + "selected_gpu_indices": [ + 0, + 1, + 2, + 3 + ], + "avg_algbw_gbps": 5.61, + "max_algbw_gbps": 5.7, + "avg_busbw_gbps": 8.415, + "max_busbw_gbps": 8.55 + }, + "server_power": { + "available": false, + "notes": [ + "IPMI power reading unavailable; server-side power characterization skipped" + ] + } +} \ No newline at end of file diff --git a/power/power-20260416-133649/result.json b/power/power-20260416-133649/result.json new file mode 100644 index 0000000..8c94277 --- /dev/null +++ b/power/power-20260416-133649/result.json @@ -0,0 +1,143 @@ +{ + "benchmark_version": "2", + "generated_at": "2026-04-16T13:36:50.097550616Z", + "hostname": "debian", + "server_model": "MLT-S06", + "benchmark_profile": "standard", + "selected_gpu_indices": [ + 0, + 1, + 2, + 3 + ], + "recommended_slot_order": [ + 2, + 3, + 1, + 0 + ], + "ramp_steps": [ + { + "step_index": 1, + "gpu_indices": [ + 2 + ], + "new_gpu_index": 2, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 699.6875, + "avg_observed_power_w": 699.6875, + "status": "OK" + }, + { + "step_index": 2, + "gpu_indices": [ + 2, + 3 + ], + "new_gpu_index": 3, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 1399.7424999999998, + "avg_observed_power_w": 699.8712499999999, + "status": "OK" + }, + { + "step_index": 3, + "gpu_indices": [ + 2, + 3, + 1 + ], + "new_gpu_index": 1, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 1997.168, + "avg_observed_power_w": 665.7226666666667, + "status": "PARTIAL", + "notes": [ + "GPU 2 was re-derated from 700 W to 650 W under combined thermal load.", + "GPU 3 was re-derated from 700 W to 650 W under combined thermal load." + ] + }, + { + "step_index": 4, + "gpu_indices": [ + 2, + 3, + 1, + 0 + ], + "new_gpu_index": 0, + "new_gpu_stable_limit_w": 700, + "total_observed_power_w": 2687.2, + "avg_observed_power_w": 671.8, + "status": "PARTIAL", + "notes": [ + "GPU 1 was re-derated from 700 W to 690 W under combined thermal load." + ] + } + ], + "overall_status": "PARTIAL", + "platform_max_tdp_w": 2690, + "server_power": { + "available": true, + "idle_w": 647.3333333333334, + "loaded_w": 1364.7002314814815, + "delta_w": 717.3668981481482, + "gpu_reported_sum_w": 2690, + "reporting_ratio": 0.2666791442929919 + }, + "findings": [ + "Recommended slot order for installation based on single-card targeted_power: 2,3,1,0.", + "GPU 2 required additional derating from 700 W (single-card) to 650 W under full-system thermal load.", + "GPU 3 required additional derating from 700 W (single-card) to 650 W under full-system thermal load.", + "GPU 1 required additional derating from 700 W (single-card) to 690 W under full-system thermal load." + ], + "gpus": [ + { + "index": 2, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:C8:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 650, + "max_observed_power_w": 699.6875, + "max_observed_temp_c": 79, + "calibration_attempts": 1, + "derated": true, + "status": "OK" + }, + { + "index": 3, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:D8:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 650, + "max_observed_power_w": 699.325, + "max_observed_temp_c": 76.5, + "calibration_attempts": 1, + "derated": true, + "status": "OK" + }, + { + "index": 1, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:5A:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 690, + "max_observed_power_w": 698.948, + "max_observed_temp_c": 75.8, + "calibration_attempts": 1, + "derated": true, + "status": "OK" + }, + { + "index": 0, + "name": "NVIDIA H100 80GB HBM3", + "bus_id": "00000000:49:00.0", + "applied_power_limit_w": 700, + "stable_power_limit_w": 700, + "max_observed_power_w": 698.496, + "max_observed_temp_c": 77, + "calibration_attempts": 1, + "status": "OK" + } + ] +} \ No newline at end of file