one more pass add

This commit is contained in:
Mikhail Chusavitin
2026-04-16 13:04:34 +03:00
parent 13eb0e5548
commit b4990a85b3
3 changed files with 1462 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,124 @@
{
"benchmark_version": "2",
"generated_at": "2026-04-16T07:39:31.70304016Z",
"hostname": "debian",
"server_model": "MLT-S06",
"benchmark_profile": "standard",
"selected_gpu_indices": [
0,
1,
2,
3
],
"recommended_slot_order": [
3,
2,
1,
0
],
"ramp_steps": [
{
"step_index": 1,
"gpu_indices": [
3
],
"new_gpu_index": 3,
"new_gpu_stable_limit_w": 700,
"total_observed_power_w": 699.746,
"avg_observed_power_w": 699.746,
"status": "OK"
},
{
"step_index": 2,
"gpu_indices": [
3,
2
],
"new_gpu_index": 2,
"new_gpu_stable_limit_w": 700,
"total_observed_power_w": 698.9375,
"avg_observed_power_w": 349.46875,
"status": "OK"
},
{
"step_index": 3,
"gpu_indices": [
3,
2,
1
],
"new_gpu_index": 1,
"new_gpu_stable_limit_w": 700,
"total_observed_power_w": 698.9350000000001,
"avg_observed_power_w": 232.97833333333335,
"status": "OK"
},
{
"step_index": 4,
"gpu_indices": [
3,
2,
1,
0
],
"new_gpu_index": 0,
"new_gpu_stable_limit_w": 575,
"total_observed_power_w": 574.178,
"avg_observed_power_w": 143.5445,
"derated": true,
"status": "PARTIAL"
}
],
"overall_status": "PARTIAL",
"platform_max_tdp_w": 2675,
"findings": [
"Recommended slot order for installation based on single-card targeted_power: 3,2,1,0.",
"Ramp step 4 (GPU 0) required derating to 575 W under combined thermal load."
],
"gpus": [
{
"index": 3,
"name": "NVIDIA H100 80GB HBM3",
"bus_id": "00000000:D8:00.0",
"applied_power_limit_w": 700,
"stable_power_limit_w": 700,
"max_observed_power_w": 699.746,
"max_observed_temp_c": 73.8,
"calibration_attempts": 1,
"status": "OK"
},
{
"index": 2,
"name": "NVIDIA H100 80GB HBM3",
"bus_id": "00000000:C8:00.0",
"applied_power_limit_w": 700,
"stable_power_limit_w": 700,
"max_observed_power_w": 699.2909999999999,
"max_observed_temp_c": 78.1,
"calibration_attempts": 1,
"status": "OK"
},
{
"index": 1,
"name": "NVIDIA H100 80GB HBM3",
"bus_id": "00000000:5A:00.0",
"applied_power_limit_w": 700,
"stable_power_limit_w": 700,
"max_observed_power_w": 698.86,
"max_observed_temp_c": 78.6,
"calibration_attempts": 1,
"status": "OK"
},
{
"index": 0,
"name": "NVIDIA H100 80GB HBM3",
"bus_id": "00000000:49:00.0",
"applied_power_limit_w": 700,
"stable_power_limit_w": 575,
"max_observed_power_w": 697.712,
"max_observed_temp_c": 76.8,
"calibration_attempts": 1,
"status": "OK"
}
]
}

View File

@@ -0,0 +1,132 @@
{
"benchmark_version": "2",
"generated_at": "2026-04-16T08:16:28.673449017Z",
"hostname": "debian",
"server_model": "MLT-S06",
"benchmark_profile": "stability",
"selected_gpu_indices": [
0,
1,
2,
3
],
"recommended_slot_order": [
1,
2,
3,
0
],
"ramp_steps": [
{
"step_index": 1,
"gpu_indices": [
1
],
"new_gpu_index": 1,
"new_gpu_stable_limit_w": 700,
"total_observed_power_w": 699.1899999999999,
"avg_observed_power_w": 699.1899999999999,
"status": "OK"
},
{
"step_index": 2,
"gpu_indices": [
1,
2
],
"new_gpu_index": 2,
"new_gpu_stable_limit_w": 700,
"total_observed_power_w": 698.405,
"avg_observed_power_w": 349.2025,
"status": "OK"
},
{
"step_index": 3,
"gpu_indices": [
1,
2,
3
],
"new_gpu_index": 3,
"new_gpu_stable_limit_w": 700,
"total_observed_power_w": 699.54,
"avg_observed_power_w": 233.17999999999998,
"status": "OK"
},
{
"step_index": 4,
"gpu_indices": [
1,
2,
3,
0
],
"new_gpu_index": 0,
"new_gpu_stable_limit_w": 700,
"total_observed_power_w": 669.28,
"avg_observed_power_w": 167.32,
"status": "OK"
}
],
"overall_status": "PARTIAL",
"platform_max_tdp_w": 2800,
"findings": [
"Recommended slot order for installation based on single-card targeted_power: 1,2,3,0.",
"GPU 0 required reduced power limit 670 W to complete targeted_power."
],
"gpus": [
{
"index": 1,
"name": "NVIDIA H100 80GB HBM3",
"bus_id": "00000000:5A:00.0",
"applied_power_limit_w": 700,
"stable_power_limit_w": 700,
"max_observed_power_w": 699.1899999999999,
"max_observed_temp_c": 78.75,
"calibration_attempts": 1,
"status": "OK"
},
{
"index": 2,
"name": "NVIDIA H100 80GB HBM3",
"bus_id": "00000000:C8:00.0",
"applied_power_limit_w": 700,
"stable_power_limit_w": 700,
"max_observed_power_w": 699.088,
"max_observed_temp_c": 77.4,
"calibration_attempts": 1,
"status": "OK"
},
{
"index": 3,
"name": "NVIDIA H100 80GB HBM3",
"bus_id": "00000000:D8:00.0",
"applied_power_limit_w": 700,
"stable_power_limit_w": 700,
"max_observed_power_w": 698.5699999999999,
"max_observed_temp_c": 74.95,
"calibration_attempts": 1,
"status": "OK"
},
{
"index": 0,
"name": "NVIDIA H100 80GB HBM3",
"bus_id": "00000000:49:00.0",
"applied_power_limit_w": 670,
"stable_power_limit_w": 700,
"max_observed_power_w": 668.956,
"max_observed_temp_c": 82.19999999999999,
"calibration_attempts": 5,
"derated": true,
"status": "PARTIAL",
"notes": [
"targeted_power attempt 1: sw_thermal throttle at 700 W",
"binary search: trying 625 W (lo=550 hi=700)",
"binary search: stable at 625 W, trying 660 W (lo=625 hi=700)",
"binary search: stable at 660 W, trying 680 W (lo=660 hi=700)",
"targeted_power attempt 4: sw_thermal throttle at 680 W",
"binary search: trying 670 W (lo=660 hi=680)"
]
}
]
}