From 3cf1989cd79079a1b4b987aee6e44263b5efa6f1 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Wed, 17 Mar 2021 15:42:29 +0800
Subject: [PATCH 01/45] Update

---
 paddle/fluid/string/tinyformat/tinyformat.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/string/tinyformat/tinyformat.h b/paddle/fluid/string/tinyformat/tinyformat.h
index a5c1798e10..7498c6a46e 100644
--- a/paddle/fluid/string/tinyformat/tinyformat.h
+++ b/paddle/fluid/string/tinyformat/tinyformat.h
@@ -777,7 +777,7 @@ inline void formatImpl(std::ostream &out, const char *fmt,
 
   // Print remaining part of format string.
   fmt = printFormatStringLiteral(out, fmt);
-  if (*fmt != '\0')
+  if (fmt != nullptr && *fmt != '\0' && *fmt != 0)
     TINYFORMAT_ERROR(
         "tinyformat: Too many conversion specifiers in format string");
 
-- 
Gitee


From 4c2766cb5ce16400042c5dd85811ebb9bea8066f Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Wed, 17 Mar 2021 19:16:58 +0800
Subject: [PATCH 02/45] Optimize

---
 CMakeLists.txt               |  7 +++++++
 cmake/external/warpctc.cmake | 12 ++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f24513d605..992c3f1c4f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,13 @@ if(WIN32)
     set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
     set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj")
 
+    if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
+        set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /Zc:inline")
+        set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /Zc:inline")
+        set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline")
+        set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline")
+    endif()
+
     if (MSVC_STATIC_CRT)
         message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
         set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /MTd")
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 0ee3e2116a..e633cae540 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -49,12 +49,12 @@ ExternalProject_Add(
     BUILD_ALWAYS    1
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS=$<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>
+                    -DCMAKE_C_FLAGS_DEBUG=$<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>
+                    -DCMAKE_C_FLAGS_RELEASE=$<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>
+                    -DCMAKE_CXX_FLAGS=$<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>
+                    -DCMAKE_CXX_FLAGS_RELEASE=$<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>
+                    -DCMAKE_CXX_FLAGS_DEBUG=$<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>
                     -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
                     -DWITH_GPU=${WITH_GPU}
                     -DWITH_OMP=${USE_OMP}
-- 
Gitee


From 89e393cf6ce416194a09897636ad686437e85cfc Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Fri, 19 Mar 2021 17:06:12 +0800
Subject: [PATCH 03/45] Second

---
 paddle/scripts/paddle_build.sh | 14 ++++++++++++++
 tools/windows/run_unittests.sh |  6 ++++++
 2 files changed, 20 insertions(+)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 3b20a403b7..3fd93a664d 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -637,6 +637,13 @@ EOF
                     do
                         retry_unittests_record="$retry_unittests_record$failed_test_lists"
                         failed_test_lists_ult=`echo "${failed_test_lists}"`
+                        if [[ "${exec_times}" == "1" ]];then
+                            if [[ "${failed_test_lists}" == "" ]];then
+                                break
+                            else
+                                read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                            fi
+                        fi
                         echo "========================================="
                         echo "This is the ${exec_time_array[$exec_times]} time to re-run"
                         echo "========================================="
@@ -1250,6 +1257,13 @@ set +x
                     do
                         retry_unittests_record="$retry_unittests_record$failed_test_lists"
                         failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
+                        if [[ "${exec_times}" == "1" ]];then
+                            if [[ "${failed_test_lists}" == "" ]];then
+                                break
+                            else
+                                read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                            fi
+                        fi
                         echo "========================================="
                         echo "This is the ${exec_time_array[$exec_times]} time to re-run"
                         echo "========================================="
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index 312711c514..dd4b21c80d 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -305,6 +305,12 @@ function unittests_retry(){
                         cur_order='first'
                     elif ( [[ "$exec_times" == "1" ]] );then
                         cur_order='second'
+                        if [[ "$failed_test_lists" == "" ]]; then
+                            break
+                        else
+                            retry_unittests=$(echo "${failed_test_lists}" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' )
+                            retry_unittests_regular=$(echo "$retry_unittests" |awk -F ' ' '{print }' | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"$|^"$1}} END{print "^"all_str"$"}')
+                        fi
                     elif ( [[ "$exec_times" == "2" ]] );then
                         cur_order='third'
                     fi
-- 
Gitee


From 029c26cafd665825e16bbe9f3acf7d7e9ab8fbe3 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 15:06:27 +0800
Subject: [PATCH 04/45] [ROCM]

---
 paddle/scripts/paddle_build.sh   | 36 +++++++++++++++++++++++++++++---
 tools/dockerfile/Dockerfile.rocm | 16 +++++++-------
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 3b20a403b7..7a360ac229 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -205,6 +205,13 @@ function cmake_base() {
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so"
                 pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt
+           elif [ "$1" == "conda-python3.7" ]; then
+                export LD_LIBRARY_PATH=/opt/conda/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/conda/bin/:${PATH}
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/conda/bin/python
+                                     -DPYTHON_INCLUDE_DIR:PATH=/opt/conda/include/python3.7m
+                                     -DPYTHON_LIBRARIES:FILEPATH=/opt/conda/lib/libpython3.so"
+                /opt/conda/bin/pip install -r ${PADDLE_ROOT}/python/requirements.txt
            fi
         else
             pip install -r ${PADDLE_ROOT}/python/requirements.txt
@@ -230,7 +237,8 @@ function cmake_base() {
         ${PYTHON_FLAGS}
         -DWITH_GPU=${WITH_GPU:-OFF}
         -DWITH_TENSORRT=${WITH_TENSORRT:-ON}
-        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
+        -DWITH_ROCM=${WITH_ROCM:-OFF}
+        -DWITH_RCCL=${WITH_RCCL:-OFF}
         -DWITH_DISTRIBUTE=${distibuted_flag}
         -DWITH_MKL=${WITH_MKL:-ON}
         -DWITH_AVX=${WITH_AVX:-OFF}
@@ -267,7 +275,8 @@ EOF
         ${PYTHON_FLAGS} \
         -DWITH_GPU=${WITH_GPU:-OFF} \
         -DWITH_TENSORRT=${WITH_TENSORRT:-ON} \
-        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
+        -DWITH_ROCM=${WITH_ROCM:-OFF} \
+        -DWITH_RCCL=${WITH_RCCL:-OFF} \
         -DWITH_DISTRIBUTE=${distibuted_flag} \
         -DWITH_MKL=${WITH_MKL:-ON} \
         -DWITH_AVX=${WITH_AVX:-OFF} \
@@ -637,6 +646,13 @@ EOF
                     do
                         retry_unittests_record="$retry_unittests_record$failed_test_lists"
                         failed_test_lists_ult=`echo "${failed_test_lists}"`
+                        if [[ "${exec_times}" == "1" ]];then
+                            if [[ "${failed_test_lists}" == "" ]];then
+                                break
+                            else
+                                read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                            fi
+                        fi
                         echo "========================================="
                         echo "This is the ${exec_time_array[$exec_times]} time to re-run"
                         echo "========================================="
@@ -1021,6 +1037,8 @@ function card_test() {
     # get the CUDA device count, XPU device count is one
     if [ "${WITH_XPU}" == "ON" ];then
         CUDA_DEVICE_COUNT=1
+    elif [ "${WITH_ROCM}" == "ON" ];then
+        CUDA_DEVICE_COUNT=4
     else
         CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
     fi
@@ -1250,6 +1268,13 @@ set +x
                     do
                         retry_unittests_record="$retry_unittests_record$failed_test_lists"
                         failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
+                        if [[ "${exec_times}" == "1" ]];then
+                            if [[ "${failed_test_lists}" == "" ]];then
+                                break
+                            else
+                                read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                            fi
+                        fi
                         echo "========================================="
                         echo "This is the ${exec_time_array[$exec_times]} time to re-run"
                         echo "========================================="
@@ -1409,7 +1434,7 @@ function parallel_test() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
-    if [ "$WITH_GPU" == "ON" ];then
+    if [ "$WITH_GPU" == "ON" ] || [ "$WITH_ROCM" == "ON" ];then
         parallel_test_base_gpu
     else
         if [ "$WITH_XPU" == "ON" ];then
@@ -1968,6 +1993,11 @@ function main() {
         parallel_test
         check_coverage
         ;;
+      check_rocm_coverage)
+        cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
+        parallel_test
+        check_coverage
+        ;;
       cmake_gen)
         cmake_gen ${PYTHON_ABI:-""}
         ;;
diff --git a/tools/dockerfile/Dockerfile.rocm b/tools/dockerfile/Dockerfile.rocm
index eab4ef07c8..5df66b9ea6 100644
--- a/tools/dockerfile/Dockerfile.rocm
+++ b/tools/dockerfile/Dockerfile.rocm
@@ -5,7 +5,6 @@
 # Build: ROCM 4.0.1
 # cd Paddle/tools/dockerfile
 # docker build -f Dockerfile.rocm  \
-#        --build-arg ROCM_VERSION=4.0.1  \
 #        -t paddlepaddle/paddle-centos-rocm401-dev:latest .
 #
 # docker run -it --device=/dev/kfd --device=/dev/dri \
@@ -22,7 +21,7 @@ ENV LANGUAGE en_US.UTF-8
 RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlite-devel \
         zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
         make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel \
-        net-tools numactl-devel chrpath
+        net-tools numactl-devel chrpath screen initscripts
 
 # Install devtoolset-7
 RUN yum install -y yum-utils centos-release-scl && \
@@ -45,11 +44,10 @@ RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && \
 ENV PATH=/opt/cmake-3.16/bin:${PATH}
 
 # ROCM
-ARG ROCM_VERSION
 RUN yum install -y kmod wget openblas-devel epel-release
 RUN echo "[ROCm]" > /etc/yum.repos.d/rocm.repo && \
     echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo && \
-    echo "baseurl=http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" >> /etc/yum.repos.d/rocm.repo && \
+    echo "baseurl=http://repo.radeon.com/rocm/yum/4.0.1" >> /etc/yum.repos.d/rocm.repo && \
     echo "enabled=1" >> /etc/yum.repos.d/rocm.repo && \
     echo "gpgcheck=0" >> /etc/yum.repos.d/rocm.repo
 RUN yum install -y rocm-dev rocm-utils rocfft miopen-hip rocblas hipsparse rocrand rccl hipcub rocthrust rocprofiler-dev roctracer-dev
@@ -89,10 +87,14 @@ RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/p
     cd .. && rm -f protobuf-cpp-3.6.1.tar.gz && rm -rf protobuf-3.6.1
 
 # conda
-RUN cd /opt && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && chmod +x Miniconda3-latest-Linux-x86_64.sh
-RUN mkdir /opt/conda && ./Miniconda3-latest-Linux-x86_64.sh -b -f -p "/opt/conda" && rm -rf Miniconda3-latest-Linux-x86_64.sh
+ENV CONDA_FILE=Miniconda3-py37_4.9.2-Linux-x86_64.sh
+RUN cd /opt && wget https://repo.anaconda.com/miniconda/${CONDA_FILE} && chmod +x ${CONDA_FILE}
+RUN mkdir /opt/conda && ./${CONDA_FILE} -b -f -p "/opt/conda" && rm -rf ${CONDA_FILE}
 ENV PATH=/opt/conda/bin:${PATH}
-RUN conda init bash && conda install -n base jupyter 
+RUN conda init bash && conda install -n base jupyter jupyterlab
+
+# install pylint and pre-commit
+RUN /opt/conda/bin/pip install pre-commit pylint pytest astroid isort protocol PyGithub
 
 # install Paddle requirement
 RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt
-- 
Gitee


From 7bc28db04fa5ea6cdd60d06dcf2a1b18b864981b Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 15:07:08 +0800
Subject: [PATCH 05/45] turn

---
 tools/get_pr_ut.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
index 001f380049..58d7d2c0d6 100644
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -265,7 +265,8 @@ class PRChecker(object):
                         '.cu'):
                     if f.find('test_') != -1 or f.find('_test') != -1:
                         print('PREC {} need check new ut'.format(f))
-                        check_added_ut = True
+                        if current_system != "Windows":
+                            check_added_ut = True
                     elif self.is_only_comment(f):
                         ut_list.append('nomap_comment_placeholder')
                     else:
-- 
Gitee


From ed70cd137ad26a8a2cf97c950e4c5ef8dd48babc Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 17:12:41 +0800
Subject: [PATCH 06/45] support

---
 CMakeLists.txt   |  6 +++++-
 cmake/cuda.cmake | 12 ++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 992c3f1c4f..10b3b0aba4 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,7 +95,7 @@ if(WIN32)
             endif()
         endforeach(flag_var)
     endif()
-
+    
     # NOTE(Avin0323): Less parallel count result in faster compilation.
     math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
     # windows build turn off warnings, use parallel compiling.
@@ -123,6 +123,10 @@ if(WIN32)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
 
+    foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
+        string(APPEND ${flag_var} "/ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221")
+    endforeach(flag_var)
+
     if (WITH_WIN_DUMP_DBG)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi")
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 2f4f5449f4..c4d1384312 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -91,7 +91,7 @@ function(select_nvcc_arch_flags out_variable)
 
   if(${CUDA_ARCH_NAME} STREQUAL "Manual")
     set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
-    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    set(CUDA_ARCH_PTX ""                        CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
     mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
   else()
     unset(CUDA_ARCH_BIN CACHE)
@@ -175,14 +175,22 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
-elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs11})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
+  set(paddle_known_gpu_archs "${paddle_known_gpu_archs11} 86")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 endif()
 
 if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
-- 
Gitee


From a5d530214c1f042379e36618a163d37aff0fd219 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 17:13:06 +0800
Subject: [PATCH 07/45] Refine

---
 .../slim/quantization/imperative/qat.py       | 483 ++++++++++--------
 .../slim/quantization/imperative/utils.py     |  46 ++
 2 files changed, 303 insertions(+), 226 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/slim/quantization/imperative/utils.py

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index afe8a3de66..04aec158ea 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -25,101 +25,99 @@ from paddle.fluid.executor import Executor
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.initializer import Constant
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
-from paddle.nn import Linear, Conv2D, Conv2DTranspose, MaxPool2D, MaxPool1D, BatchNorm1D, BatchNorm2D, BatchNorm3D, SyncBatchNorm
+from paddle.nn import Linear, Conv2D, Conv2DTranspose, MaxPool2D, MaxPool1D
+from paddle.nn import BatchNorm1D, BatchNorm2D, BatchNorm3D, SyncBatchNorm
 from paddle.fluid.dygraph.nn import BatchNorm, Pool2D
 from paddle.fluid.io import load_inference_model, save_inference_model
-from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6, Tanh, Softmax, PReLU, Swish
+from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6
+from paddle.nn.layer.activation import Tanh, Softmax, PReLU, Swish
 from paddle.fluid.log_helper import get_logger
 from . import quant_nn
 from .. import quantization_pass
+from . import utils
 
-__all__ = ['ImperativeQuantAware', 'ImperativeCalcOutScale']
+__all__ = ['ImperativeQuantAware']
 
 _logger = get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
 
-_op_real_in_out_name = {
-    "conv2d": [["Input", "Filter"], ["Output"]],
-    "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
-    "pool2d": [["X"], ["Out"]],
-    "elementwise_add": [["X", "Y"], ["Out"]],
-    "softmax": [["X"], ["Out"]],
-    "relu": [["X"], ["Out"]],
-    "relu6": [["X"], ["Out"]],
-    "leaky_relu": [["X"], ["Out"]],
-    "prelu": [["X"], ["Out"]],
-    "tanh": [["X"], ["Out"]],
-    "batch_norm": [["X"], ["Y"]],
-    "sigmoid": [["X"], ["Out"]],
-    "swish": [["X"], ["Out"]],
-}
-
 
 class ImperativeQuantAware(object):
     """
-    Add the fake quant logic for given quantizable layers, namely add the quant_dequant
-    computational logic both for activation inputs and weight inputs.
+    Applying quantization aware training (QAT) to dgraph model.
     """
 
     def __init__(self,
-                 weight_bits=8,
-                 activation_bits=8,
+                 quantizable_layer_type=['Conv2D', 'Linear'],
                  weight_quantize_type='abs_max',
                  activation_quantize_type='moving_average_abs_max',
+                 weight_bits=8,
+                 activation_bits=8,
                  moving_rate=0.9,
-                 quantizable_layer_type=['Conv2D', 'Linear'],
                  weight_preprocess_layer=None,
                  act_preprocess_layer=None,
                  weight_quantize_layer=None,
                  act_quantize_layer=None):
-        r"""
+        """
         The constructor for ImperativeQuantAware.
 
         Args:
-            weight_bits(int): quantization bit number for weights,
-                whereas the bias is not quantized.
-            activation_bits(int): quantization bit number for activations.
+            quantizable_layer_type(list[str]): List the type of layers that
+                will be quantized. Default is ['Conv2D', 'Linear'].
+                The quantizable_op_type in QuantizationFreezePass and
+                ConvertToInt8Pass must be the same as this.
             weight_quantize_type(str): quantization type for weights,
                 which supports 'abs_max' now. The 'moving_average_abs_max'
-                usually is not used for weights, since weights are fixed once the
-                model is well trained.
+                usually is not used for weights, since weights are fixed
+                once the model is well trained.
             activation_quantize_type(str): quantization type for activations,
                 which supports 'abs_max' and 'moving_average_abs_max' now.
-                If using 'abs_max' mode, the quantization scale will be calculated
-                dynamically each step in both training and testing period. If using
-                'moving_average_abs_max', the static quantization scale will be calculated
-                during training and used in inference.
-            moving_rate(float): the parameter for 'moving_average_abs_max' quantization.
-            quantizable_layer_type(list[str]): List the type of layers that will be quantized. 
-                Default is ['Conv2D', 'Linear']. The quantizable_op_type in
-                QuantizationFreezePass and ConvertToInt8Pass must be the same as this.
-            weight_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to preprocess
-                weight before quantization. Using this can quickly test if user's
-                preprocess method works or not. The input is non-quantized
-                weight and function returns processed weight to be quantized.
-                If None, the weight will be quantized directly. Default is None.
-            act_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to preprocess
-                activation before quantization. Using this can quickly test if user's
-                preprocess method works or not. The input is non-quantized
-                activation and function returns processed activation to be quantized.
-                If None, the activation will be quantized directly. Default is None.
-            weight_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to quantize weight.
+                If using 'abs_max' mode, the quantization scale will be
+                calculated dynamically each step in both training and testing
+                period. If using 'moving_average_abs_max', the static
+                quantization scale will be calculated during training and
+                used in inference.
+            weight_bits(int): quantization bit number for weights,
+                whereas the bias is not quantized.
+            activation_bits(int): quantization bit number for activations.
+            moving_rate(float): the parameter for 'moving_average_abs_max'
+                quantization.
+            weight_preprocess_layer(paddle.nn.Layer, optional): A paddle
+                Layer that defines how to preprocess weight before quantization.
+                Using this can quickly test if user's preprocess method works
+                or not. The input is non-quantized weight and function returns
+                processed weight to be quantized.
+                If None, the weight will be quantized directly.
+                Default is None.
+            act_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer
+                that defines how to preprocess activation before quantization.
+                Using this can quickly test if user's preprocess method works
+                or not. The input is non-quantized activation and function returns
+                processed activation to be quantized.
+                If None, the activation will be quantized directly.
+                Default is None.
+            weight_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that
+                defines how to quantize weight.
                 Using this can quickly test if user's quantization method works or not.
                 In this layer, user should both define quantization method and
                 dequantization method, that is, the function's input is non-quantized
-                weight and returns dequantized weight. If None, will use
-                quantization op defined by 'weight_quantize_type'. Default is None.
-            act_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to quantize activation.
+                weight and returns dequantized weight.
+                If None, will use uantization op defined by 'weight_quantize_type'.
+                Default is None.
+            act_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that defines
+                how to quantize activation.
                 Using this can quickly test if user's quantization method works or not.
                 In this layer, user should both define quantization method and
                 dequantization method, that is, the function's input is non-quantized
-                activation and returns dequantized activation. If None, will use
-                quantization op defined by 'activation_quantize_type'. Default is None.
+                activation and returns dequantized activation. 
+                If None, will use quantization op defined by 'activation_quantize_type'.
+                Default is None.
 
         Note:
-            If user sets attribute 'skip_quant' to a Layer that support dynamic quantization and sets
-            it to true, the layer would not be quantized during training. If this attribute is not sets
-            or the attribute is false, the Layer would be qunatized in training.
+            If user sets attribute 'skip_quant' to a Layer that support dynamic
+            quantization and sets it to true, the layer would not be quantized
+            during training. If this attribute is not sets or the attribute is
+            false, the Layer would be qunatized in training.
 
         Examples 1:
         .. code-block:: python
@@ -196,141 +194,175 @@ class ImperativeQuantAware(object):
                 model_path="./imperative_model_qat")
         """
         super(ImperativeQuantAware, self).__init__()
-        self._weight_bits = weight_bits
-        self._activation_bits = activation_bits
-        self._moving_rate = moving_rate
-        self._activation_quantize_type = activation_quantize_type
-        self._weight_quantize_type = weight_quantize_type
-
-        self._weight_pre_layer = weight_preprocess_layer
-        self._act_pre_layer = act_preprocess_layer
-        self._weight_quant_layer = weight_quantize_layer
-        self._act_quant_layer = act_quantize_layer
-        self._out_scale = ImperativeCalcOutScale()
-
-        t_check = lambda method: method is None or issubclass(method, dygraph.layers.Layer)
-        assert t_check(
-            self._weight_pre_layer), "weight_preprocess should be nn.Layer"
-        assert t_check(self._act_pre_layer), "act_preprocess should be nn.Layer"
-        assert t_check(
-            self._weight_quant_layer), "weight_quantize should be nn.Layer"
-        assert t_check(self._act_quant_layer), "act_quantize should be nn.Layer"
-
-        quant_type = {
-            'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max'
-        }
 
-        assert activation_quantize_type != 'channel_wise_abs_max', \
-            "The activation quantization type does not support 'channel_wise_abs_max'."
-        if activation_quantize_type not in quant_type:
-            raise ValueError(
-                "Unknown activation_quantize_type : '%s'. It can only be "
-                "'abs_max' or 'moving_average_abs_max' now." %
-                (str(activation_quantize_type)))
-        if weight_quantize_type not in quant_type:
-            raise ValueError(
-                "Unknown weight_quantize_type: '%s'. It can only be "
-                "'abs_max' or 'moving_average_abs_max' or 'channel_wise_abs_max' now."
-                % (str(weight_quantize_type)))
-
-        self._quant_layers_map = {
-            'Conv2D': Conv2D,
-            'Linear': Linear,
-            'Pool2D': Pool2D,
-            'ReLU': ReLU,
-            'LeakyReLU': LeakyReLU,
-            'ReLU6': ReLU6,
-            'Softmax': Softmax,
-            'Tanh': Tanh,
-            'Swish': Swish
+        kwargs = {
+            "quantizable_layer_type": quantizable_layer_type,
+            "weight_quantize_type": weight_quantize_type,
+            "activation_quantize_type": activation_quantize_type,
+            "weight_bits": weight_bits,
+            "activation_bits": activation_bits,
+            "moving_rate": moving_rate,
+            "weight_preprocess_layer": weight_preprocess_layer,
+            "act_preprocess_layer": act_preprocess_layer,
+            "weight_quantize_layer": weight_quantize_layer,
+            "act_quantize_layer": act_quantize_layer
         }
-        self._quantizable_layer_type = tuple(
-            self._quant_layers_map[layer]
-            if layer in self._quant_layers_map else layer
-            for layer in quantizable_layer_type)
-        for layer in self._quantizable_layer_type:
-            assert not isinstance(
-                layer, str), "{} is unspported to be quantized.".format(layer)
+
+        self._quantize_inputs = ImperativeQuantizeInputs(**kwargs)
+
+        self._calc_output_scale = ImperativeCalcOutputScale()
 
     def quantize(self, model):
         """
-        According to weights' and activations' quantization types, the model will be added some fake
-        quant ops, such as fake_quantize_dequantize_moving_average_abs_max, fake_quantize_dequantize_abs_max
-        and so on. At the same time, the out_scale value of outputs would be calculated.
+        According to weights' and activations' quantization types,
+        the model will be added some fake quant ops, such as
+        fake_quantize_dequantize_moving_average_abs_max,
+        fake_quantize_dequantize_abs_max and so on. At the same time,
+        the out_scale value of outputs would be calculated.
 
         Args:
             model(fluid.dygraph.Layer): the model to be quantized.
         Returns:
             None
         """
+        assert isinstance(model, dygraph.Layer), \
+            "The model must be the instance of dygraph.Layer."
+        self._quantize_inputs.apply(model)
+        self._calc_output_scale.apply(model)
+
+    def save_quantized_model(self, layer, path, input_spec=None, **config):
+        self._calc_output_scale.save_quantized_model(layer, path, input_spec,
+                                                     **config)
+
+
+class ImperativeQuantizeInputs(object):
+    """
+    Based on the input params, add the quant_dequant computational
+    logic both for activation inputs and weight inputs.
+    """
+
+    def __init__(self,
+                 quantizable_layer_type=['Conv2D', 'Linear'],
+                 weight_quantize_type='abs_max',
+                 activation_quantize_type='moving_average_abs_max',
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 weight_preprocess_layer=None,
+                 act_preprocess_layer=None,
+                 weight_quantize_layer=None,
+                 act_quantize_layer=None):
+        """
+        The constructor for ImperativeQuantizeInputs. 
+
+        Please refer to the args of ImperativeQuantAware.
+        """
+        super(ImperativeQuantizeInputs, self).__init__()
+
+        self._quantizable_layer_type = tuple(
+            utils._quant_layers_map[layer]
+            if layer in utils._quant_layers_map else layer
+            for layer in quantizable_layer_type)
+        for layer in self._quantizable_layer_type:
+            assert not isinstance(layer, str), \
+                "%s is unspported to be quantized." % layer
+
+        quantize_type = {
+            'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max'
+        }
+        assert weight_quantize_type in quantize_type, \
+            "Unsupported weight_quantize_type: %s. It can only " \
+            "be abs_max or moving_average_abs_max or " \
+            "channel_wise_abs_max." % weight_quantize_type
+        assert activation_quantize_type != 'channel_wise_abs_max' \
+            and activation_quantize_type in quantize_type, \
+            "Unsupported activation_quantize_type: %s. It can " \
+            "only be abs_max or moving_average_abs_max now." \
+            % activation_quantize_type
+
+        bits_check = lambda bits: isinstance(bits, int) \
+            and bits >= 0 and bits <= 16
+        assert bits_check(weight_bits), \
+            "weight_bits should be 1, 2,... or 16."
+        assert bits_check(activation_bits), \
+            "activation_bits should be 1, 2,... or 16."
+
+        layer_check = lambda method: method is None or \
+            issubclass(method, dygraph.layers.Layer)
+        assert layer_check(weight_preprocess_layer), \
+            "weight_preprocess should be nn.Layer."
+        assert layer_check(act_preprocess_layer), \
+            "act_preprocess should be nn.Layer."
+        assert layer_check(weight_quantize_layer), \
+            "weight_quantize should be nn.Layer."
+        assert layer_check(act_quantize_layer), \
+            "act_quantize should be nn.Layer."
+
+        self._kwargs = {
+            "weight_quantize_type": weight_quantize_type,
+            "activation_quantize_type": activation_quantize_type,
+            "weight_bits": weight_bits,
+            "activation_bits": activation_bits,
+            "moving_rate": moving_rate,
+            "weight_pre_layer": weight_preprocess_layer,
+            "act_pre_layer": act_preprocess_layer,
+            "weight_quant_layer": weight_quantize_layer,
+            "act_quant_layer": act_quantize_layer
+        }
+
+    def apply(self, model):
+        assert isinstance(model, dygraph.Layer), \
+            "The model must be the instance of dygraph.Layer."
+
         for name, layer in model.named_sublayers():
-            if not isinstance(layer, self._quantizable_layer_type):
-                continue
-            if hasattr(layer, "skip_quant") and layer.skip_quant == True:
+            if not isinstance(layer, self._quantizable_layer_type) \
+                or (hasattr(layer, "skip_quant") \
+                    and layer.skip_quant == True):
                 continue
 
+            # TODO(jc): optimize this module
             last_idx = 0
             idx = 0
             obj = model
-            parent = model
-
             while idx < len(name):
                 if (name[idx] == '.'):
-                    if hasattr(parent, name[last_idx:idx]):
+                    if hasattr(obj, name[last_idx:idx]):
                         obj = getattr(obj, name[last_idx:idx])
-                        parent = obj
                         last_idx = idx + 1
                 idx += 1
             target = name[last_idx:idx]
 
-            quant_layer = self._get_quantized_counterpart(layer)
+            quant_layer = self._get_quantized_layer(layer)
             setattr(quant_layer, "layer_name", layer.full_name())
             setattr(obj, target, quant_layer)
 
-        self._out_scale.calc_out_scale(model)
-
-    def _get_quantized_counterpart(self, layer):
-        quant_layers = tuple(self._quant_layers_map.values())
-        quantized_counterpart = tuple('Quantized' + k
-                                      for k in self._quant_layers_map.keys())
-
-        predicate = lambda value: isinstance(layer, value)
-        index_generator = (i for i, v in enumerate(quant_layers)
-                           if predicate(v))
-
-        try:
-            index = next(index_generator)
-        except StopIteration:
-            _logger.fatal("The layer {} is unsupported to be quantized.".format(
-                layer.full_name()))
-            sys.exit(-1)
+    def _get_quantized_layer(self, layer):
+        quant_layer_name = None
+        for key, value in utils._quant_layers_map.items():
+            if isinstance(layer, value):
+                quant_layer_name = 'Quantized' + key
+                break
+        assert quant_layer_name is not None, \
+            "The layer %s is unsupported to be quantized." \
+            % layer.full_name()
 
         layer_with_weight = ['QuantizedConv2D', 'QuantizedLinear']
-        if quantized_counterpart[index] not in layer_with_weight:
-            quant_layer_class_name = 'QuantizedNoweightLayer'
-        else:
-            quant_layer_class_name = quantized_counterpart[index]
-        quantized_layer = quant_nn.__dict__[quant_layer_class_name](
-            layer, self._weight_bits, self._activation_bits, self._moving_rate,
-            self._weight_quantize_type, self._activation_quantize_type,
-            self._weight_pre_layer, self._act_pre_layer,
-            self._weight_quant_layer, self._act_quant_layer)
-        return quantized_layer
+        if quant_layer_name not in layer_with_weight:
+            quant_layer_name = 'QuantizedNoweightLayer'
 
-    def save_quantized_model(self, layer, path, input_spec=None, **config):
-        self._out_scale.save_quantized_model(layer, path, input_spec, **config)
+        return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs)
 
 
-class ImperativeCalcOutScale(object):
+class ImperativeCalcOutputScale(object):
     def __init__(self, moving_rate=0.9):
         """
-        Add the logic of calculating and setting output quantization scales of some layers.
-        These output quantization scales may be used by tensorRT or some other inference engines.
+        Add the logic of calculating and setting output scales of some layers. 
 
         Args:
-            moving_rate(float): The decay coefficient of moving average. The default value is 0.9.
+            moving_rate(float): The decay coefficient of moving average.
+                                The default value is 0.9.
         """
-        super(ImperativeCalcOutScale, self).__init__()
+        super(ImperativeCalcOutputScale, self).__init__()
         self._moving_rate = moving_rate
         self._out_scale_layer_type_list = (
             BatchNorm, BatchNorm1D, BatchNorm2D, BatchNorm3D, Conv2D, LeakyReLU,
@@ -339,83 +371,22 @@ class ImperativeCalcOutScale(object):
         self._register_hook_handle_list = []
         self._out_scale_dict = collections.OrderedDict()
 
-    # Determine whether layer supports calculation out_scale
-    def _is_matched_layer(self, layer):
-        if not isinstance(layer, self._out_scale_layer_type_list):
-            if 'quantized_' not in layer.full_name():
-                return False
-        return True
-
-    # When inferenc model is saved, the logic in hook would not be executed
-    # in program translation, so that some parameters can not created in
-    # __init__, which would cause the model to fail to save. Therefore, the
-    # parameters creation in the hook is advanced to be exected outside the hook.
-    def _add_new_parameters(self, layer, name=None):
-        dtype = layer._dtype if layer._dtype is not None else "float32"
-        if dtype not in ["float32", "float64"]:
-            return
-        scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
-        scale_name = unique_name.generate(scale_prefix)
-        scale_attr = ParamAttr(
-            name=scale_name, initializer=Constant(1), trainable=False)
-        layer._quant_out_scale = layer.create_parameter(
-            shape=[1], attr=scale_attr, dtype=dtype)
-        layer._quant_out_scale.stop_gradient = True
-
-        state_prefix = "{}.state".format(name) if name else 'outscale.state'
-        state_attr = ParamAttr(
-            name=unique_name.generate(state_prefix),
-            initializer=Constant(1),
-            trainable=False)
-        layer._quant_out_state = layer.create_parameter(
-            shape=[1], attr=state_attr, dtype=dtype)
-        layer._quant_out_state.stop_gradient = True
-
-        accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
-        accum_attr = ParamAttr(
-            name=unique_name.generate(accum_prefix),
-            initializer=Constant(1),
-            trainable=False)
-        layer._quant_out_accum = layer.create_parameter(
-            shape=[1], attr=accum_attr, dtype=dtype)
-        layer._quant_out_accum.stop_gradient = True
-
-    # Judge whether the op in program matches the Layer in dynamic model
-    def _is_op_matched(self, layer_name, op, block):
-        output_var_names = quantization_pass._get_op_output_var_names(op)
-        for output_var_name in output_var_names:
-            output_var_tensor = block.var(output_var_name)
-            if output_var_tensor.dtype not in [
-                    core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32
-            ]:
-                return False
-
-        # Because the naming styles of static and dynamic graph are different,
-        # in order to avoid mistakes, we unify the name here.
-        op_type = output_var_names[0].split(".")[0]
-        op_type = op_type.rsplit("_", 1)[0]
-        if op_type == 'depthwise_conv2d':
-            op_type = 'conv2d'
-        if 'prelu' in op_type:
-            op_type = op_type.replace('prelu', 'p_re_lu')
-        if 'relu' in op_type:
-            op_type = op_type.replace('relu', 're_lu')
-        return op_type in layer_name
-
-    def calc_out_scale(self, model):
+    def apply(self, model):
         """
-        Insert the `moving_average_abs_max_scale` op to calculate output scale of Specific layers in model.
+        Insert the `moving_average_abs_max_scale` op to calculate output 
+        scale of specific layers in model.
 
         Args:
-            model(fluid.dygraph.Layer): The target model which would be calculate the output quantization scale.
+            model(fluid.dygraph.Layer): The target model which would be
+            calculate the output quantization scale.
 
         Returns:
             None
         """
-        assert isinstance(
-            model, dygraph.Layer), "model must be the instance of dygraph.Layer"
+        assert isinstance(model, dygraph.Layer), \
+            "The model must be the instance of dygraph.Layer."
         for _, layer in model.named_sublayers():
-            if self._is_matched_layer(layer):
+            if self._is_target_layer(layer):
                 self._add_new_parameters(layer)
                 forward_post_hook_handle = layer.register_forward_post_hook(
                     self._forward_post_hook)
@@ -459,7 +430,7 @@ class ImperativeCalcOutScale(object):
                                                       .numpy())
             else:
                 for _, sub_layer in self._layer.named_sublayers():
-                    if self._is_matched_layer(sub_layer):
+                    if self._is_target_layer(sub_layer):
                         layer_name = sub_layer.full_name()
                         if hasattr(sub_layer, "layer_name"):
                             layer_name = sub_layer.layer_name
@@ -510,7 +481,7 @@ class ImperativeCalcOutScale(object):
             forward_op = None
             for block in inference_program.blocks:
                 for op in block.ops:
-                    if op.type in _op_real_in_out_name:
+                    if op.type in utils._op_real_in_out_name:
                         if op_count > len(ops_list):
                             warnings.warn(
                                 "The number of Layer which has out_threshold attribute should be bigger than the op in inference model"
@@ -567,6 +538,66 @@ class ImperativeCalcOutScale(object):
         if is_dynamic_mode:
             paddle.disable_static()
 
+    def _is_target_layer(self, layer):
+        return isinstance(layer, self._out_scale_layer_type_list) \
+            or 'quantized_' in layer.full_name()
+
+    # When inferenc model is saved, the logic in hook would not be executed
+    # in program translation, so that some parameters can not created in
+    # __init__, which would cause the model to fail to save. Therefore, the
+    # parameters creation in the hook is advanced to be exected outside the hook.
+    def _add_new_parameters(self, layer, name=None):
+        dtype = layer._dtype if layer._dtype is not None else "float32"
+        if dtype not in ["float32", "float64"]:
+            return
+        scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
+        scale_name = unique_name.generate(scale_prefix)
+        scale_attr = ParamAttr(
+            name=scale_name, initializer=Constant(1), trainable=False)
+        layer._quant_out_scale = layer.create_parameter(
+            shape=[1], attr=scale_attr, dtype=dtype)
+        layer._quant_out_scale.stop_gradient = True
+
+        state_prefix = "{}.state".format(name) if name else 'outscale.state'
+        state_attr = ParamAttr(
+            name=unique_name.generate(state_prefix),
+            initializer=Constant(1),
+            trainable=False)
+        layer._quant_out_state = layer.create_parameter(
+            shape=[1], attr=state_attr, dtype=dtype)
+        layer._quant_out_state.stop_gradient = True
+
+        accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
+        accum_attr = ParamAttr(
+            name=unique_name.generate(accum_prefix),
+            initializer=Constant(1),
+            trainable=False)
+        layer._quant_out_accum = layer.create_parameter(
+            shape=[1], attr=accum_attr, dtype=dtype)
+        layer._quant_out_accum.stop_gradient = True
+
+    # Judge whether the op in program matches the Layer in dynamic model
+    def _is_op_matched(self, layer_name, op, block):
+        output_var_names = quantization_pass._get_op_output_var_names(op)
+        for output_var_name in output_var_names:
+            output_var_tensor = block.var(output_var_name)
+            if output_var_tensor.dtype not in [
+                    core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32
+            ]:
+                return False
+
+        # Because the naming styles of static and dynamic graph are different,
+        # in order to avoid mistakes, we unify the name here.
+        op_type = output_var_names[0].split(".")[0]
+        op_type = op_type.rsplit("_", 1)[0]
+        if op_type == 'depthwise_conv2d':
+            op_type = 'conv2d'
+        if 'prelu' in op_type:
+            op_type = op_type.replace('prelu', 'p_re_lu')
+        if 'relu' in op_type:
+            op_type = op_type.replace('relu', 're_lu')
+        return op_type in layer_name
+
     def _forward_post_hook(self, layer, input, output):
         assert isinstance(
             output, (core.VarBase, framework.Variable)
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
new file mode 100644
index 0000000000..a732181db7
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -0,0 +1,46 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.nn import Linear, Conv2D
+from paddle.fluid.dygraph.nn import Pool2D
+from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6
+from paddle.nn.layer.activation import Tanh, Softmax, PReLU, Swish
+
+_op_real_in_out_name = {
+    "conv2d": [["Input", "Filter"], ["Output"]],
+    "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
+    "pool2d": [["X"], ["Out"]],
+    "elementwise_add": [["X", "Y"], ["Out"]],
+    "softmax": [["X"], ["Out"]],
+    "relu": [["X"], ["Out"]],
+    "relu6": [["X"], ["Out"]],
+    "leaky_relu": [["X"], ["Out"]],
+    "prelu": [["X"], ["Out"]],
+    "tanh": [["X"], ["Out"]],
+    "batch_norm": [["X"], ["Y"]],
+    "sigmoid": [["X"], ["Out"]],
+    "swish": [["X"], ["Out"]],
+}
+
+_quant_layers_map = {
+    'Conv2D': Conv2D,
+    'Linear': Linear,
+    'Pool2D': Pool2D,
+    'ReLU': ReLU,
+    'LeakyReLU': LeakyReLU,
+    'ReLU6': ReLU6,
+    'Softmax': Softmax,
+    'Tanh': Tanh,
+    'Swish': Swish
+}
-- 
Gitee


From 45043622149bb3920e6e808316a5c6a399f6fbe0 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 17:13:34 +0800
Subject: [PATCH 08/45] [CustomOp]

---
 paddle/fluid/platform/eigen_ext.h     |  96 ++++++++++++++++
 paddle/fluid/platform/float16.h       | 152 +++-----------------------
 paddle/fluid/platform/float16_test.cc |  14 +--
 paddle/fluid/platform/float16_test.cu |   1 +
 4 files changed, 112 insertions(+), 151 deletions(-)

diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h
index 9e2c363046..a8ad729a31 100644
--- a/paddle/fluid/platform/eigen_ext.h
+++ b/paddle/fluid/platform/eigen_ext.h
@@ -17,6 +17,7 @@
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -26,6 +27,7 @@ namespace Eigen {
 using bfloat16 = paddle::platform::bfloat16;
 using complex64 = paddle::platform::complex64;
 using complex128 = paddle::platform::complex128;
+using float16 = paddle::platform::float16;
 
 template <typename T>
 struct NumTraits;
@@ -103,6 +105,33 @@ struct NumTraits<complex128> : GenericNumTraits<std::complex<double>> {
   static inline int digits10() { return NumTraits<Real>::digits10(); }
 };
 
+template <>
+struct NumTraits<float16> : GenericNumTraits<float16> {
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  HOSTDEVICE static inline float16 epsilon() {
+    return paddle::platform::raw_uint16_to_float16(0x0800);
+  }
+  HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); }
+  HOSTDEVICE static inline float16 highest() {
+    return paddle::platform::raw_uint16_to_float16(0x7bff);
+  }
+  HOSTDEVICE static inline float16 lowest() {
+    return paddle::platform::raw_uint16_to_float16(0xfbff);
+  }
+  HOSTDEVICE static inline float16 infinity() {
+    return paddle::platform::raw_uint16_to_float16(0x7c00);
+  }
+  HOSTDEVICE static inline float16 quiet_NaN() {
+    return paddle::platform::raw_uint16_to_float16(0x7c01);
+  }
+};
+
 namespace numext {
 
 //////////// bfloat methods /////////////
@@ -302,5 +331,72 @@ HOSTDEVICE inline double abs(const complex128& a) {
   return paddle::platform::abs(a);
 }
 
+//////////// float16 methods /////////////
+
+template <>
+HOSTDEVICE inline bool(isnan)(const float16& a) {
+  return (paddle::platform::isnan)(a);
+}
+
+template <>
+HOSTDEVICE inline bool(isinf)(const float16& a) {
+  return (paddle::platform::isinf)(a);
+}
+
+template <>
+HOSTDEVICE inline bool(isfinite)(const float16& a) {
+  return (paddle::platform::isfinite)(a);
+}
+
+template <>
+HOSTDEVICE inline float16 exp(const float16& a) {
+  return float16(::expf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 erf(const float16& a) {
+  return float16(::erff(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 log(const float16& a) {
+  return float16(::logf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 tanh(const float16& a) {
+  return float16(::tanhf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 sqrt(const float16& a) {
+  return float16(::sqrtf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 ceil(const float16& a) {
+  return float16(::ceilf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 floor(const float16& a) {
+  return float16(::floorf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 round(const float16& a) {
+  return float16(::roundf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 pow(const float16& a, const float16& b) {
+  return float16(::powf(static_cast<float>(a), static_cast<float>(b)));
+}
+
+template <>
+HOSTDEVICE inline float16 abs(const float16& a) {
+  return float16(::fabs(static_cast<float>(a)));
+}
+
 }  // namespace numext
 }  // namespace Eigen
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index df2a24400b..bdd4d54b3d 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -15,6 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
+
+#include <cmath>
+#include <iostream>
 #include <limits>
 
 #ifdef PADDLE_WITH_CUDA
@@ -25,18 +28,6 @@ limitations under the License. */
 #include <hip/hip_runtime.h>
 #endif
 
-#ifdef __GNUC__
-#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
-#else
-#define PADDLE_GNUC_VER 0
-#endif  // __GNUC__
-
-#ifdef __clang__
-#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__)
-#else
-#define PADDLE_CLANG_VER 0
-#endif  // __clang__
-
 #if defined(__CUDACC__) && CUDA_VERSION >= 7050
 #define PADDLE_CUDA_FP16
 #include <cuda_fp16.h>
@@ -55,17 +46,15 @@ limitations under the License. */
 
 #define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)
 
-namespace paddle {
-namespace platform {
-
-// Forward declare float16 for eigen.h
-struct float16;
-
-}  // namespace platform
-}  // namespace paddle
-
-#include "paddle/fluid/platform/hostdevice.h"
-#include "unsupported/Eigen/CXX11/Tensor"
+#if (defined(__CUDACC__) || defined(__HIPCC__))
+#define HOSTDEVICE __host__ __device__
+#define DEVICE __device__
+#define HOST __host__
+#else
+#define HOSTDEVICE
+#define DEVICE
+#define HOST
+#endif
 
 namespace paddle {
 namespace platform {
@@ -73,7 +62,7 @@ namespace platform {
 // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
 // and aligned at least on a 2-byte boundary, which leads to efficient
 // memory access of float16 struct and also makes float16 compatible
-// with CUDA half, ARM float16_t, and Eigen::half data types.
+// with CUDA half, ARM float16_t data types.
 struct PADDLE_ALIGN(2) float16 {
  public:
   uint16_t x;
@@ -100,8 +89,6 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif  // PADDLE_CUDA_FP16
 
-  HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {}
-
 #ifdef PADDLE_WITH_NATIVE_FP16
   // __fp16 is a native half precision data type for arm cpu,
   // float16_t is an alias for __fp16
@@ -163,11 +150,6 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif
 
-  HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
-    x = rhs.x;
-    return *this;
-  }
-
 #ifdef PADDLE_WITH_NATIVE_FP16
   HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
     x = *reinterpret_cast<const uint16_t*>(&rhs);
@@ -245,12 +227,6 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif  // PADDLE_CUDA_FP16
 
-  HOSTDEVICE inline explicit operator Eigen::half() const {
-    Eigen::half h;
-    h.x = x;
-    return h;
-  }
-
 #ifdef PADDLE_WITH_NATIVE_FP16
   HOSTDEVICE inline explicit operator float16_t() const {
     return *reinterpret_cast<const float16_t*>(this);
@@ -1108,105 +1084,3 @@ HOSTDEVICE inline paddle::platform::float16 abs(
 }
 
 }  // namespace std
-
-namespace Eigen {
-
-using float16 = paddle::platform::float16;
-
-template <>
-struct NumTraits<float16> : GenericNumTraits<float16> {
-  enum {
-    IsSigned = true,
-    IsInteger = false,
-    IsComplex = false,
-    RequireInitialization = false
-  };
-
-  HOSTDEVICE static inline float16 epsilon() {
-    return paddle::platform::raw_uint16_to_float16(0x0800);
-  }
-  HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); }
-  HOSTDEVICE static inline float16 highest() {
-    return paddle::platform::raw_uint16_to_float16(0x7bff);
-  }
-  HOSTDEVICE static inline float16 lowest() {
-    return paddle::platform::raw_uint16_to_float16(0xfbff);
-  }
-  HOSTDEVICE static inline float16 infinity() {
-    return paddle::platform::raw_uint16_to_float16(0x7c00);
-  }
-  HOSTDEVICE static inline float16 quiet_NaN() {
-    return paddle::platform::raw_uint16_to_float16(0x7c01);
-  }
-};
-
-namespace numext {
-
-template <>
-HOSTDEVICE inline bool(isnan)(const float16& a) {
-  return (paddle::platform::isnan)(a);
-}
-
-template <>
-HOSTDEVICE inline bool(isinf)(const float16& a) {
-  return (paddle::platform::isinf)(a);
-}
-
-template <>
-HOSTDEVICE inline bool(isfinite)(const float16& a) {
-  return (paddle::platform::isfinite)(a);
-}
-
-template <>
-HOSTDEVICE inline float16 exp(const float16& a) {
-  return float16(::expf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 erf(const float16& a) {
-  return float16(::erff(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 log(const float16& a) {
-  return float16(::logf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 tanh(const float16& a) {
-  return float16(::tanhf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 sqrt(const float16& a) {
-  return float16(::sqrtf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 ceil(const float16& a) {
-  return float16(::ceilf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 floor(const float16& a) {
-  return float16(::floorf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 round(const float16& a) {
-  return float16(::roundf(static_cast<float>(a)));
-}
-
-template <>
-HOSTDEVICE inline float16 pow(const float16& a, const float16& b) {
-  return float16(::powf(static_cast<float>(a), static_cast<float>(b)));
-}
-
-template <>
-HOSTDEVICE inline float16 abs(const float16& a) {
-  return float16(::fabs(static_cast<float>(a)));
-}
-
-}  // namespace numext
-
-}  // namespace Eigen
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index f607988d92..56633a3511 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -8,26 +8,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #include "paddle/fluid/platform/float16.h"
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
 
 TEST(float16, conversion_cpu) {
-  // Explicit conversion from Eigen::half
-  EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00);
-  EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800);
-  EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555);
-  EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000);
-  EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000);
-  EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff);
-  EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00);
-
   // Conversion from float
   EXPECT_EQ(float16(1.0f).x, 0x3c00);
   EXPECT_EQ(float16(0.5f).x, 0x3800);
@@ -61,8 +54,6 @@ TEST(float16, conversion_cpu) {
   float16 v_assign;
   v_assign = float16(0);
   EXPECT_EQ(v_assign.x, 0x0000);
-  v_assign = Eigen::half(1.0f);
-  EXPECT_EQ(v_assign.x, 0x3c00);
   v_assign = 0.5f;
   EXPECT_EQ(v_assign.x, 0x3800);
   v_assign = 0.33333;
@@ -73,7 +64,6 @@ TEST(float16, conversion_cpu) {
   EXPECT_EQ(v_assign.x, 0x3c00);
 
   // Conversion operator
-  EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
   EXPECT_EQ(static_cast<float>(float16(0.5f)), 0.5f);
   EXPECT_NEAR(static_cast<double>(float16(0.33333)), 0.33333, 0.0001);
   EXPECT_EQ(static_cast<int>(float16(-1)), -1);
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 527da79041..d181660e31 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign)                                 \
-- 
Gitee


From 66cbb0e330af6dd4a912a55e1f8bfd41f826d953 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 17:13:59 +0800
Subject: [PATCH 09/45] Convert

---
 python/paddle/fluid/framework.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 04ed384846..036e8ab304 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -877,7 +877,7 @@ def _getitem_impl_(var, item):
                 new_list_tensor.append(dim)
             else:
                 assert (isinstance(dim, int))
-                temp_out = var.block.create_var(dtype='int32')
+                temp_out = var.block.create_var(dtype='int64')
                 fill_constant([1], dim, force_cpu=True, out=temp_out)
                 new_list_tensor.append(temp_out)
         return new_list_tensor
-- 
Gitee


From e573f19a2879019a8da5323f1a2e8610dcb82757 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 17:14:25 +0800
Subject: [PATCH 10/45] support

---
 paddle/fluid/operators/temporal_shift_op.cc   |  19 +-
 paddle/fluid/operators/temporal_shift_op.cu   | 179 +++++++++++----
 paddle/fluid/operators/temporal_shift_op.h    | 211 ++++++++++++------
 python/paddle/fluid/layers/nn.py              |  22 +-
 .../tests/unittests/test_temporal_shift_op.py |  33 ++-
 5 files changed, 338 insertions(+), 126 deletions(-)

diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc
index 2e87447ed1..acf99d09ff 100644
--- a/paddle/fluid/operators/temporal_shift_op.cc
+++ b/paddle/fluid/operators/temporal_shift_op.cc
@@ -80,7 +80,8 @@ class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X",
              "The input tensor of temporal shift operator. "
-             "This is a 4-D tensor with shape of [N*T,  C, H, W]. "
+             "This is a 4-D tensor with shape of [N*T, C, H, W] "
+             "or [N*T, H, W, C]. "
              "While N is the batch size, T is the temporal segment "
              "number, C is the channel number, H is the height of "
              "features and W is the width of features. "
@@ -100,15 +101,23 @@ class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker {
         "by 1 along the temporal dimension. :attr:`shift_ratio` should be in "
         "range [0, 0.5]. Default 0.25.")
         .SetDefault(0.25);
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCHW) Only used in "
+        "an optional string from: \"NHWC\", \"NCHW\". "
+        "Specify that the data format of the input and output data is "
+        "channel_first or channel_last.")
+        .SetDefault("NCHW");
 
     AddComment(R"DOC(
           This operator calculates the temporal shifting features for Input(X).
 
-          Input(X) should be in shape of [N*T, C, H, W], while N is the batch
-          size, T is the temporal segment number specified by :attr:`seg_num`, 
-          C is the channel number, H and W is the height and width of features.
+          Input(X) should be in shape of [N*T, C, H, W] or [N*T, H, W, C], while 
+          N is the batch size, T is the temporal segment number specified by 
+          :attr:`seg_num`, C is the channel number, H and W is the height and 
+          width of features.
 
-          Temporal Shifting is calculated as follows:
+          Temporal Shifting is calculated as follows when data format is NCHW:
           
           Step 1: Reshape Input(X) to [N, T, C, H, W].
 
diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu
index 4f2d7ce3cf..cb1ff5335c 100644
--- a/paddle/fluid/operators/temporal_shift_op.cu
+++ b/paddle/fluid/operators/temporal_shift_op.cu
@@ -19,22 +19,46 @@ namespace operators {
 using framework::Tensor;
 
 template <typename T>
-__global__ void KeTemporalShiftFw(const T* input, T* output, const int ntchw,
-                                  const int tchw, const int chw, const int hw,
-                                  const int w, const int t, const int c,
-                                  const float shift_ratio) {
+__global__ void KeTemporalShiftFwNCHW(const T* input, T* output,
+                                      const int ntchw, const int tchw,
+                                      const int chw, const int hw, const int t,
+                                      const int c1, const int c2) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
   int src_it = 0;
+
   for (; tid < ntchw; tid += stride) {
-    int in = tid / tchw;
     int it = (tid % tchw) / chw;
     int ic = (tid % chw) / hw;
-    int ih = (tid % hw) / w;
-    int iw = tid % w;
 
-    const int c1 = static_cast<int>(c * shift_ratio);
-    const int c2 = static_cast<int>(c * 2 * shift_ratio);
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+
+    if (src_it < 0 || src_it >= t) {
+      output[tid] = 0;
+    } else {
+      output[tid] = input[tid + (src_it - it) * chw];
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeTemporalShiftFwNHWC(const T* input, T* output,
+                                      const int nthwc, const int thwc,
+                                      const int hwc, const int t, const int c,
+                                      const int c1, const int c2) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int src_it = 0;
+
+  for (; tid < nthwc; tid += stride) {
+    int it = (tid % thwc) / hwc;
+    int ic = tid % c;
 
     if (ic < c1) {
       src_it = it - 1;
@@ -47,42 +71,65 @@ __global__ void KeTemporalShiftFw(const T* input, T* output, const int ntchw,
     if (src_it < 0 || src_it >= t) {
       output[tid] = 0;
     } else {
-      int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
-      output[tid] = input[src_idx];
+      output[tid] = input[tid + (src_it - it) * hwc];
     }
   }
 }
 
 template <typename T>
-__global__ void KeTemporalShiftBw(const T* output_grad, T* input_grad,
-                                  const int ntchw, const int tchw,
-                                  const int chw, const int hw, const int w,
-                                  const int t, const int c,
-                                  const float shift_ratio) {
+__global__ void KeTemporalShiftBwNCHW(const T* output_grad, T* input_grad,
+                                      const int ntchw, const int tchw,
+                                      const int chw, const int hw, const int t,
+                                      const int c1, const int c2) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
   int src_it = 0;
+
   for (; tid < ntchw; tid += stride) {
-    int in = tid / tchw;
     int it = (tid % tchw) / chw;
     int ic = (tid % chw) / hw;
-    int ih = (tid % hw) / w;
-    int iw = tid % w;
-
-    const int c1 = static_cast<int>(c * shift_ratio);
-    const int c2 = static_cast<int>(c * 2 * shift_ratio);
 
     if (ic < c1) {
-      src_it = it - 1;
+      src_it = it + 1;
     } else if (ic < c2) {
+      src_it = it - 1;
+    } else {
+      src_it = it;
+    }
+
+    if (src_it >= 0 && src_it < t) {
+      input_grad[tid] = output_grad[tid + (src_it - it) * chw];
+    } else {
+      input_grad[tid] = 0;
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeTemporalShiftBwNHWC(const T* output_grad, T* input_grad,
+                                      const int nthwc, const int thwc,
+                                      const int hwc, const int t, const int c,
+                                      const int c1, const int c2) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int src_it = 0;
+
+  for (; tid < nthwc; tid += stride) {
+    int it = (tid % thwc) / hwc;
+    int ic = tid % c;
+
+    if (ic < c1) {
       src_it = it + 1;
+    } else if (ic < c2) {
+      src_it = it - 1;
     } else {
       src_it = it;
     }
 
     if (src_it >= 0 && src_it < t) {
-      int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
-      input_grad[src_idx] = output_grad[tid];
+      input_grad[tid] = output_grad[tid + (src_it - it) * hwc];
+    } else {
+      input_grad[tid] = 0;
     }
   }
 }
@@ -98,27 +145,48 @@ class TemporalShiftOpCUDAKernel : public framework::OpKernel<T> {
     auto* output = ctx.Output<Tensor>("Out");
     int t = ctx.Attr<int>("seg_num");
     float shift_ratio = ctx.Attr<float>("shift_ratio");
+    const std::string data_format_str = ctx.Attr<std::string>("data_format");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_format_str);
 
     const int nt = input->dims()[0];
-    const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
+    const int c = (data_layout == DataLayout::kNCHW ? input->dims()[1]
+                                                    : input->dims()[3]);
+    const int h = (data_layout == DataLayout::kNCHW ? input->dims()[2]
+                                                    : input->dims()[1]);
+    const int w = (data_layout == DataLayout::kNCHW ? input->dims()[3]
+                                                    : input->dims()[2]);
 
     const int hw = h * w;
     const int chw = c * hw;
     const int tchw = t * chw;
     const int ntchw = nt * chw;
 
+    const int c1 = static_cast<int>(c * shift_ratio);
+    const int c2 = static_cast<int>(c * 2 * shift_ratio);
+
+    framework::DDim out_dims = (data_layout == DataLayout::kNCHW
+                                    ? framework::make_ddim({nt, c, h, w})
+                                    : framework::make_ddim({nt, h, w, c}));
     const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
+    T* output_data = output->mutable_data<T>(out_dims, ctx.GetPlace());
 
     int pixelNum = nt * chw;
-    platform::GpuLaunchConfig config =
-        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
+    int threads = 1024;
+    int grid = (pixelNum + threads - 1) / threads;
+    const auto& dev_ctx = ctx.cuda_device_context();
+    int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads;
+    grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);
 
-    KeTemporalShiftFw<T><<<config.block_per_grid, config.thread_per_block, 0,
-                           ctx.cuda_device_context().stream()>>>(
-        input_data, output_data, ntchw, tchw, chw, hw, w, t, c, shift_ratio);
+    if (data_layout == DataLayout::kNCHW) {
+      KeTemporalShiftFwNCHW<
+          T><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+          input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
+    } else {
+      KeTemporalShiftFwNHWC<
+          T><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+          input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
+    }
   }
 };
 
@@ -130,32 +198,49 @@ class TemporalShiftGradOpCUDAKernel : public framework::OpKernel<T> {
     auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     int t = ctx.Attr<int>("seg_num");
     float shift_ratio = ctx.Attr<float>("shift_ratio");
+    const std::string data_format_str = ctx.Attr<std::string>("data_format");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_format_str);
 
     const int nt = output_grad->dims()[0];
-    const int c = output_grad->dims()[1];
-    const int h = output_grad->dims()[2];
-    const int w = output_grad->dims()[3];
+    const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1]
+                                                    : output_grad->dims()[3]);
+    const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2]
+                                                    : output_grad->dims()[1]);
+    const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3]
+                                                    : output_grad->dims()[2]);
 
     const int hw = h * w;
     const int chw = c * hw;
     const int tchw = t * chw;
     const int ntchw = nt * chw;
 
+    const int c1 = static_cast<int>(c * shift_ratio);
+    const int c2 = static_cast<int>(c * 2 * shift_ratio);
+
+    framework::DDim in_grad_dims = (data_layout == DataLayout::kNCHW
+                                        ? framework::make_ddim({nt, c, h, w})
+                                        : framework::make_ddim({nt, h, w, c}));
     const T* output_grad_data = output_grad->data<T>();
     T* input_grad_data =
-        input_grad->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
-    math::SetConstant<platform::CUDADeviceContext, T>()(
-        ctx.template device_context<platform::CUDADeviceContext>(), input_grad,
-        static_cast<T>(0));
+        input_grad->mutable_data<T>(in_grad_dims, ctx.GetPlace());
 
     int pixelNum = nt * chw;
-    platform::GpuLaunchConfig config =
-        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
+    int threads = 1024;
+    int grid = (pixelNum + threads - 1) / threads;
+    const auto& dev_ctx = ctx.cuda_device_context();
+    int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads;
+    grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);
 
-    KeTemporalShiftBw<T><<<config.block_per_grid, config.thread_per_block, 0,
-                           ctx.cuda_device_context().stream()>>>(
-        output_grad_data, input_grad_data, ntchw, tchw, chw, hw, w, t, c,
-        shift_ratio);
+    if (data_layout == DataLayout::kNCHW) {
+      KeTemporalShiftBwNCHW<
+          T><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+          output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2);
+    } else {
+      KeTemporalShiftBwNHWC<
+          T><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+          output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h
index 4c7eed5af4..05364b94c9 100644
--- a/paddle/fluid/operators/temporal_shift_op.h
+++ b/paddle/fluid/operators/temporal_shift_op.h
@@ -17,12 +17,106 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+using DataLayout = framework::DataLayout;
 
-static HOSTDEVICE inline int GetEntryIndex(int in, int it, int ic, int ih,
-                                           int iw, const int tchw,
-                                           const int chw, const int hw,
-                                           const int w) {
-  return in * tchw + it * chw + ic * hw + ih * w + iw;
+template <typename T>
+void TemporalShiftFwNCHW(const T* input, T* output, const int ntchw,
+                         const int tchw, const int chw, const int hw,
+                         const int t, const int c1, const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < ntchw; i++) {
+    int it = (i % tchw) / chw;
+    int ic = (i % chw) / hw;
+
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+
+    if (src_it < 0 || src_it >= t) {
+      output[i] = 0;
+    } else {
+      output[i] = input[i + (src_it - it) * chw];
+    }
+  }
+}
+
+template <typename T>
+void TemporalShiftFwNHWC(const T* input, T* output, const int nthwc,
+                         const int thwc, const int hwc, const int t,
+                         const int c, const int c1, const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < nthwc; i++) {
+    int it = (i % thwc) / hwc;
+    int ic = i % c;
+
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+
+    if (src_it < 0 || src_it >= t) {
+      output[i] = 0;
+    } else {
+      output[i] = input[i + (src_it - it) * hwc];
+    }
+  }
+}
+
+template <typename T>
+void TemporalShiftBwNCHW(const T* output_grad, T* input_grad, const int ntchw,
+                         const int tchw, const int chw, const int hw,
+                         const int t, const int c1, const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < ntchw; i++) {
+    int it = (i % tchw) / chw;
+    int ic = (i % chw) / hw;
+
+    if (ic < c1) {
+      src_it = it + 1;
+    } else if (ic < c2) {
+      src_it = it - 1;
+    } else {
+      src_it = it;
+    }
+
+    if (src_it >= 0 && src_it < t) {
+      input_grad[i] = output_grad[i + (src_it - it) * chw];
+    } else {
+      input_grad[i] = 0;
+    }
+  }
+}
+
+template <typename T>
+void TemporalShiftBwNHWC(const T* output_grad, T* input_grad, const int nthwc,
+                         const int thwc, const int hwc, const int t,
+                         const int c, const int c1, const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < nthwc; i++) {
+    int it = (i % thwc) / hwc;
+    int ic = i % c;
+
+    if (ic < c1) {
+      src_it = it + 1;
+    } else if (ic < c2) {
+      src_it = it - 1;
+    } else {
+      src_it = it;
+    }
+
+    if (src_it >= 0 && src_it < t) {
+      input_grad[i] = output_grad[i + (src_it - it) * hwc];
+    } else {
+      input_grad[i] = 0;
+    }
+  }
 }
 
 template <typename T>
@@ -33,44 +127,38 @@ class TemporalShiftKernel : public framework::OpKernel<T> {
     auto* output = ctx.Output<Tensor>("Out");
     int t = ctx.Attr<int>("seg_num");
     float shift_ratio = ctx.Attr<float>("shift_ratio");
+    const std::string data_format_str = ctx.Attr<std::string>("data_format");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_format_str);
 
     const int nt = input->dims()[0];
-    const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    const int c1 = static_cast<int>(c * shift_ratio);
-    const int c2 = static_cast<int>(c * 2 * shift_ratio);
+    const int c = (data_layout == DataLayout::kNCHW ? input->dims()[1]
+                                                    : input->dims()[3]);
+    const int h = (data_layout == DataLayout::kNCHW ? input->dims()[2]
+                                                    : input->dims()[1]);
+    const int w = (data_layout == DataLayout::kNCHW ? input->dims()[3]
+                                                    : input->dims()[2]);
 
     const int hw = h * w;
     const int chw = c * hw;
     const int tchw = t * chw;
+    const int ntchw = nt * chw;
 
+    const int c1 = static_cast<int>(c * shift_ratio);
+    const int c2 = static_cast<int>(c * 2 * shift_ratio);
+
+    framework::DDim out_dims = (data_layout == DataLayout::kNCHW
+                                    ? framework::make_ddim({nt, c, h, w})
+                                    : framework::make_ddim({nt, h, w, c}));
     const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
-
-    int src_it = 0;
-    for (int i = 0; i < output->numel(); i++) {
-      int in = i / tchw;
-      int it = (i % tchw) / chw;
-      int ic = (i % chw) / hw;
-      int ih = (i % hw) / w;
-      int iw = i % w;
-
-      if (ic < c1) {
-        src_it = it - 1;
-      } else if (ic < c2) {
-        src_it = it + 1;
-      } else {
-        src_it = it;
-      }
-
-      if (src_it < 0 || src_it >= t) {
-        output_data[i] = 0;
-      } else {
-        int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
-        output_data[i] = input_data[src_idx];
-      }
+    T* output_data = output->mutable_data<T>(out_dims, ctx.GetPlace());
+
+    if (data_layout == DataLayout::kNCHW) {
+      TemporalShiftFwNCHW<T>(input_data, output_data, ntchw, tchw, chw, hw, t,
+                             c1, c2);
+    } else {
+      TemporalShiftFwNHWC<T>(input_data, output_data, ntchw, tchw, chw, t, c,
+                             c1, c2);
     }
   }
 };
@@ -83,44 +171,39 @@ class TemporalShiftGradKernel : public framework::OpKernel<T> {
     auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     int t = ctx.Attr<int>("seg_num");
     float shift_ratio = ctx.Attr<float>("shift_ratio");
+    const std::string data_format_str = ctx.Attr<std::string>("data_format");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_format_str);
 
     const int nt = output_grad->dims()[0];
-    const int c = output_grad->dims()[1];
-    const int h = output_grad->dims()[2];
-    const int w = output_grad->dims()[3];
-
-    const int c1 = static_cast<int>(c * shift_ratio);
-    const int c2 = static_cast<int>(c * 2 * shift_ratio);
+    const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1]
+                                                    : output_grad->dims()[3]);
+    const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2]
+                                                    : output_grad->dims()[1]);
+    const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3]
+                                                    : output_grad->dims()[2]);
 
     const int hw = h * w;
     const int chw = c * hw;
     const int tchw = t * chw;
+    const int ntchw = nt * chw;
+
+    const int c1 = static_cast<int>(c * shift_ratio);
+    const int c2 = static_cast<int>(c * 2 * shift_ratio);
 
+    framework::DDim in_grad_dims = (data_layout == DataLayout::kNCHW
+                                        ? framework::make_ddim({nt, c, h, w})
+                                        : framework::make_ddim({nt, h, w, c}));
     const T* output_grad_data = output_grad->data<T>();
     T* input_grad_data =
-        input_grad->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
-    memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
-
-    int src_it = 0;
-    for (int i = 0; i < output_grad->numel(); i++) {
-      int in = i / tchw;
-      int it = (i % tchw) / chw;
-      int ic = (i % chw) / hw;
-      int ih = (i % hw) / w;
-      int iw = i % w;
-
-      if (ic < c1) {
-        src_it = it - 1;
-      } else if (ic < c2) {
-        src_it = it + 1;
-      } else {
-        src_it = it;
-      }
-
-      if (src_it >= 0 && src_it < t) {
-        int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
-        input_grad_data[src_idx] = output_grad_data[i];
-      }
+        input_grad->mutable_data<T>(in_grad_dims, ctx.GetPlace());
+
+    if (data_layout == DataLayout::kNCHW) {
+      TemporalShiftBwNCHW<T>(output_grad_data, input_grad_data, ntchw, tchw,
+                             chw, hw, t, c1, c2);
+    } else {
+      TemporalShiftBwNHWC<T>(output_grad_data, input_grad_data, ntchw, tchw,
+                             chw, t, c, c1, c2);
     }
   }
 };
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 8d96e46f83..fa8df14c86 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -13334,7 +13334,7 @@ def shuffle_channel(x, group, name=None):
 
 
 @templatedoc()
-def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
+def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"):
     """
 
     **Temporal Shift Operator**
@@ -13348,6 +13348,8 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
+        data_format(str, optional): Data format that specifies the layout of input.
+            It can be "NCHW" or "NHWC". Default: "NCHW".
 
     Returns:
         out(Tensor): The temporal shifting result is a tensor with the
@@ -13365,6 +13367,13 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
             input = paddle.randn([6, 4, 2, 2])
             out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
     """
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. "
+                         "Received Attr(data_format): {}.".format(data_format))
+    if in_dygraph_mode():
+        return core.ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio',
+                                       shift_ratio, 'data_format', data_format)
+
     helper = LayerHelper("temporal_shift", **locals())
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift')
     check_type(seg_num, 'seg_num', int, 'temporal_shift')
@@ -13375,16 +13384,15 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
     if not isinstance(seg_num, int):
         raise TypeError("seg_num must be int type.")
 
-    if in_dygraph_mode():
-        return core.ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio',
-                                       shift_ratio)
-
     helper.append_op(
         type="temporal_shift",
         inputs={"X": x},
         outputs={"Out": out},
-        attrs={"seg_num": seg_num,
-               "shift_ratio": shift_ratio})
+        attrs={
+            "seg_num": seg_num,
+            "shift_ratio": shift_ratio,
+            "data_format": data_format
+        })
     return out
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
index 050c38e549..5bab4a52bf 100644
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -22,7 +22,9 @@ import paddle
 from paddle.fluid import core
 
 
-def temporal_shift(x, seg_num, shift_ratio):
+def temporal_shift(x, seg_num, shift_ratio, data_format):
+    if data_format == "NHWC":
+        x = np.transpose(x, (0, 3, 1, 2))
     shape = x.shape
     reshape_x = x.reshape((-1, seg_num, shape[1], shape[2], shape[3]))
     pad_x = np.pad(reshape_x, ((0, 0), (1, 1), (0, 0), (0, 0), (0, 0)),
@@ -33,7 +35,10 @@ def temporal_shift(x, seg_num, shift_ratio):
     slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :]
     slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :]
     concat_x = np.concatenate([slice1, slice2, slice3], axis=2)
-    return concat_x.reshape(shape)
+    out = concat_x.reshape(shape)
+    if data_format == "NHWC":
+        out = np.transpose(out, (0, 2, 3, 1))
+    return out
 
 
 class TestTemporalShift(OpTest):
@@ -45,11 +50,13 @@ class TestTemporalShift(OpTest):
         self.attrs = {
             "seg_num": self.seg_num,
             "shift_ratio": self.shift_ratio,
+            "data_format": self.data_format
         }
 
         self.inputs = {"X": x, }
 
-        output = temporal_shift(x, self.seg_num, self.shift_ratio)
+        output = temporal_shift(x, self.seg_num, self.shift_ratio,
+                                self.data_format)
         self.outputs = {"Out": output}
 
     def test_check_output(self):
@@ -63,6 +70,7 @@ class TestTemporalShift(OpTest):
         self.seg_num = 3
         self.shift_ratio = 0.25
         self.dtype = 'float64'
+        self.data_format = 'NCHW'
 
 
 class TestTemporalShift2(TestTemporalShift):
@@ -70,6 +78,7 @@ class TestTemporalShift2(TestTemporalShift):
         self.x_shape = (4, 9, 7, 7)
         self.seg_num = 2
         self.shift_ratio = 0.2
+        self.data_format = 'NCHW'
 
 
 class TestTemporalShift3(TestTemporalShift):
@@ -77,6 +86,15 @@ class TestTemporalShift3(TestTemporalShift):
         self.x_shape = (3, 10, 5, 5)
         self.seg_num = 1
         self.shift_ratio = 0.3
+        self.data_format = 'NCHW'
+
+
+class TestTemporalShift4(TestTemporalShift):
+    def initTestCase(self):
+        self.x_shape = (6, 5, 5, 4)
+        self.seg_num = 3
+        self.shift_ratio = 0.25
+        self.data_format = 'NHWC'
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -87,6 +105,7 @@ class TestTemporalShiftFP16(TestTemporalShift):
         self.seg_num = 1
         self.shift_ratio = 0.3
         self.dtype = 'float16'
+        self.data_format = 'NCHW'
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
@@ -114,6 +133,14 @@ class TestTemporalShiftAPI(unittest.TestCase):
             out = paddle.nn.functional.temporal_shift(
                 x=input, seg_num=2, shift_ratio=0.2)
 
+    def test_error(self):
+        def attr_data_format():
+            input = paddle.randn([6, 4, 2, 2])
+            out = paddle.nn.functional.temporal_shift(
+                x=input, seg_num=2, shift_ratio=0.2, data_format="HWC")
+
+        self.assertRaises(ValueError, attr_data_format)
+
 
 if __name__ == "__main__":
     unittest.main()
-- 
Gitee


From 30009f8d3f0aead5b5a5632b619f9f9a5f3b2f07 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 18:35:53 +0800
Subject: [PATCH 11/45] remove

---
 python/paddle/fluid/dataloader/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py
index ac90cbafe1..e46083295d 100755
--- a/python/paddle/fluid/dataloader/dataset.py
+++ b/python/paddle/fluid/dataloader/dataset.py
@@ -14,8 +14,8 @@
 
 from __future__ import print_function
 
+import paddle
 from .. import framework
-import paddle.dataset.common
 
 __all__ = [
     "Dataset", "IterableDataset", "TensorDataset", "ComposeDataset",
-- 
Gitee


From 103974c8570f3b102a86070f31cdac3507fd06a6 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 18:36:19 +0800
Subject: [PATCH 12/45] =?UTF-8?q?=E3=80=90Paddle.Fleet=E3=80=91Fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../distributed/fleet/runtime/the_one_ps.py   |  3 +-
 .../fleet/parameter_server/ir/public.py       |  2 +-
 .../fleet/parameter_server/ir/trainer_pass.py |  2 +-
 .../tests/unittests/test_dist_fleet_base.py   | 15 ++--
 .../unittests/test_dist_fleet_grad_clip.py    | 87 +++++++++++--------
 5 files changed, 62 insertions(+), 47 deletions(-)

diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index abec4710f5..a568680600 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -150,7 +150,8 @@ class CommonAccessor:
         oop = None
 
         for op in optimizer_ops:
-            if op.input("Param")[0] == param_name:
+            if ("Param" in op.input_names) and (
+                    op.input("Param")[0] == param_name):
                 oop = op
                 break
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index b987e01bba..baf8add04c 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -31,7 +31,7 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundR
 from paddle.fluid.transpiler.details.program_utils import delete_ops
 
 OP_NAME_SCOPE = "op_namescope"
-CLIP_OP_NAME_SCOPE = "@CLIP"
+CLIP_OP_NAME_SCOPE = "gradient_clip"
 STEP_COUNTER = "@PS_STEP_COUNTER@"
 LEARNING_RATE_DECAY_COUNTER = "@LR_DECAY_COUNTER@"
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
index 2292d4c0a4..08e64c15c4 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
@@ -32,7 +32,7 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_ta
 from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
 
 OP_NAME_SCOPE = "op_namescope"
-CLIP_OP_NAME_SCOPE = "@CLIP"
+CLIP_OP_NAME_SCOPE = "gradient_clip"
 STEP_COUNTER = "@PS_STEP_COUNTER@"
 OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
 RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index 03d7251f82..e84e91de0b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -18,6 +18,7 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distribu
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.fluid as fluid
+import paddle
 """
     high level unit test for distribute fleet.
 """
@@ -112,23 +113,21 @@ class FleetDistRunnerBase(object):
 
     def build_optimizer(self, avg_cost, strategy):
         use_grad_clip = int(os.getenv('GRAD_CLIP', 0))
+        grad_clip = None
         if use_grad_clip:
             # 1: clip_by_value; 2: clip_by_norm; 3:clip_by_global_norm
             if use_grad_clip == 1:
-                fluid.clip.set_gradient_clip(
-                    clip=fluid.clip.GradientClipByValue(2.0))
+                grad_clip = paddle.nn.ClipGradByValue(min=-5.0, max=5.0)
             elif use_grad_clip == 2:
-                fluid.clip.set_gradient_clip(
-                    clip=fluid.clip.GradientClipByNorm(2.0))
+                grad_clip = paddle.nn.ClipGradByNorm(2.0)
             elif use_grad_clip == 3:
-                fluid.clip.set_gradient_clip(
-                    clip=fluid.clip.GradientClipByGlobalNorm(2.0))
+                grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
 
         use_decay = int(os.getenv("USE_DECAY", "0"))
         if use_decay:
             scheduler = paddle.optimizer.lr.ExponentialDecay(
                 learning_rate=LEARNING_RATE, gamma=0.999, verbose=True)
-            optimizer = fluid.optimizer.SGD(scheduler)
+            optimizer = fluid.optimizer.SGD(scheduler, grad_clip=grad_clip)
             """
             # learning rate decay method before 2.0
             optimizer = fluid.optimizer.SGD(
@@ -139,7 +138,7 @@ class FleetDistRunnerBase(object):
                     staircase=True)) 
             """
         else:
-            optimizer = fluid.optimizer.SGD(LEARNING_RATE)
+            optimizer = fluid.optimizer.SGD(LEARNING_RATE, grad_clip=grad_clip)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
index 3c68af474c..f9509d6007 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
@@ -16,53 +16,66 @@ from __future__ import print_function
 
 import os
 import unittest
-import paddle.fluid as fluid
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
 from test_dist_fleet_base import TestFleetBase
-from dist_fleet_simnet_bow import train_network
 
 
-@unittest.skip(reason="Skip unstable ut, add it after PR 22957 merged")
-class TestDistGeoClipByGlobalNormTranspiler(unittest.TestCase):
-    def test_pserver(self):
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
+class TestDistGeoClipByGlobalNorm(TestFleetBase):
+    def _setup_config(self):
+        self._mode = "geo"
+        self._reader = "dataset"
+        self._geo_sgd_need_push_nums = 5
+        self._grad_clip_mode = 3
 
-        fleet.init(role)
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
+            "http_proxy": ""
+        }
+        required_envs.update(need_envs)
 
-        batch_size = 128
-        is_sparse = True
-        is_distribute = False
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
-        strategy = DistributeTranspilerConfig()
-        strategy.sync_mode = False
-        strategy.geo_sgd_mode = True
-        strategy.geo_sgd_need_push_nums = 5
+    def test_dist_train(self):
+        self.check_with_place(
+            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
 
-        avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse)
-        fluid.clip.set_gradient_clip(
-            clip=fluid.clip.GradientClipByGlobalNorm(2.0))
+    def _setup_config(self):
+        self._sync_mode = False
+        self._grad_clip_mode = 2
 
-        optimizer = fluid.optimizer.SGD(0.1)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(avg_cost)
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
+            "http_proxy": ""
+        }
+        required_envs.update(need_envs)
+
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
-        pserver_startup_program = fleet.startup_program
-        pserver_mian_program = fleet.main_program
+    def test_dist_train(self):
+        self.check_with_place(
+            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
 
 
-@unittest.skip(reason="Skip unstable ut, add it after PR 22957 merged")
-class TestDistGeoClipByGlobalNorm(TestFleetBase):
+class TestDistASyncClipByValue(TestFleetBase):
     def _setup_config(self):
-        self._mode = "geo"
+        self._mode = "async"
         self._reader = "dataset"
-        self._geo_sgd_need_push_nums = 5
-        self._grad_clip_mode = 3
+        self._grad_clip_mode = 1
 
     def check_with_place(self,
                          model_file,
@@ -84,8 +97,11 @@ class TestDistGeoClipByGlobalNorm(TestFleetBase):
         self.check_with_place(
             "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
 
+
+class TestDistASyncClipByNorm(TestFleetBase):
     def _setup_config(self):
-        self._sync_mode = False
+        self._mode = "async"
+        self._reader = "dataset"
         self._grad_clip_mode = 2
 
     def check_with_place(self,
@@ -109,7 +125,6 @@ class TestDistGeoClipByGlobalNorm(TestFleetBase):
             "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
 
 
-@unittest.skip(reason="Skip unstable ut, add it after PR 22957 merged")
 class TestDistASyncClipByGlobalNorm(TestFleetBase):
     def _setup_config(self):
         self._mode = "async"
-- 
Gitee


From 2df064c5044b804dcec9d923aecfc54b4d33fa5e Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 18:36:44 +0800
Subject: [PATCH 13/45] [Custom

---
 python/paddle/utils/cpp_extension/extension_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index b68100fe52..1ff42a7bcb 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -442,7 +442,8 @@ def find_cuda_home():
                     [which_cmd, 'nvcc'], stderr=devnull)
                 if six.PY3:
                     nvcc_path = nvcc_path.decode()
-                nvcc_path = nvcc_path.rstrip('\r\n')
+                # Multi CUDA, select the first
+                nvcc_path = nvcc_path.split('\r\n')[0]
 
                 # for example: /usr/local/cuda/bin/nvcc
                 cuda_home = os.path.dirname(os.path.dirname(nvcc_path))
-- 
Gitee


From c28257594b7b853f5e98deb0a9908cc4cd0dfee6 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 18:37:11 +0800
Subject: [PATCH 14/45] r

---
 paddle/fluid/inference/tensorrt/op_teller.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 052d17878a..72338bcef1 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -159,7 +159,11 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
         return false;
       } else {
         int axis = BOOST_GET_CONST(int, desc.GetAttr("axis"));
-        if (axis <= 0) return false;
+        if (with_dynamic_shape) {
+          if (axis < 0) return false;
+        } else {
+          if (axis <= 0) return false;
+        }
       }
     }
     if (op_type == "transpose2" || op_type == "transpose") {
-- 
Gitee


From ac9f870b293591de04f0d7c0ebd70606308ec300 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 18:43:20 +0800
Subject: [PATCH 15/45] r

---
 .../fluid/inference/api/analysis_predictor.cc |  1 +
 .../inference/tensorrt/convert/CMakeLists.txt |  1 +
 .../inference/tensorrt/convert/gather_op.cc   | 78 +++++++++++++++++++
 paddle/fluid/inference/tensorrt/op_teller.cc  |  5 ++
 .../ir/inference/test_trt_gather_op.py        | 70 +++++++++++++++++
 5 files changed, 155 insertions(+)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/gather_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index d6080bd692..fc436311f0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1191,6 +1191,7 @@ USE_TRT_CONVERTER(slice);
 USE_TRT_CONVERTER(scale);
 USE_TRT_CONVERTER(stack);
 USE_TRT_CONVERTER(clip);
+USE_TRT_CONVERTER(gather);
 #endif
 
 namespace paddle_infer {
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index f9586ca170..59205529ef 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -5,6 +5,7 @@ nv_library(tensorrt_converter
                 pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc
                 shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc
                 emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc
+                gather_op.cc
            DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/convert/gather_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_op.cc
new file mode 100644
index 0000000000..346a8bffa0
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/gather_op.cc
@@ -0,0 +1,78 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Gather Op
+ */
+class GatherOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a fluid gather op to tensorrt gather layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    std::string input_name = op_desc.Input("X").front();
+    std::string index_name = op_desc.Input("Index").front();
+    std::string output_name = op_desc.Output("Out").front();
+
+    const auto input_tensor = engine_->GetITensor(input_name);
+    const auto index_tensor = engine_->GetITensor(index_name);
+
+    const int axis = 0;
+
+    auto layer = TRT_ENGINE_ADD_LAYER(engine_, Gather, *input_tensor,
+                                      *index_tensor, axis);
+
+    auto odim = layer->getOutput(0)->getDimensions();
+
+    auto reshape_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
+
+    nvinfer1::Dims target_shape{};
+    target_shape.nbDims = odim.nbDims - 1;
+    for (int i = 0; i < axis; ++i) {
+      target_shape.d[i] = odim.d[i];
+    }
+    target_shape.d[axis] = 0;
+    for (int i = axis + 1; i < target_shape.nbDims; ++i) {
+      target_shape.d[i] = odim.d[i + 1];
+    }
+
+    reshape_layer->setReshapeDimensions(target_shape);
+
+    RreplenishLayerAndOutput(reshape_layer, "gather", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(gather, GatherOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 72338bcef1..44939606b4 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -109,6 +109,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "transpose",
       "flatten2",
       "flatten",
+      "gather",
   };
 };
 
@@ -186,6 +187,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
         if (axis != 1) return false;
       }
     }
+    if (op_type == "gather") {
+      // current not support axis from input, use default 0
+      if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false;
+    }
     if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
   }
   return false;
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py
new file mode 100644
index 0000000000..fec15ea729
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTGatherTest(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name='data', shape=[-1, 512], dtype='float32')
+            index = fluid.data(name='index', shape=[-1], dtype='int32')
+            scale_out = self.append_gather(data, index)
+            out = fluid.layers.batch_norm(scale_out, is_test=True)
+
+        index = np.arange(self.num_gather, dtype='int32')
+        np.random.shuffle(index)
+
+        self.feeds = {
+            "data": np.random.random([self.bs, 512]).astype("float32"),
+            "index": index,
+        }
+
+        self.enable_trt = True
+        self.trt_parameters = TRTGatherTest.TensorRTParam(
+            1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def set_params(self):
+        self.num_gather = 16
+        self.bs = 32
+
+    def append_gather(self, data, index):
+        return fluid.layers.gather(data, index=index)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+class TRTGatherTest1(TRTGatherTest):
+    def set_params(self):
+        self.num_gather = 32
+        self.bs = 32
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
Gitee


From d0c4d63aed923c957cbf8b621b6a6256d44c40e5 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 22 Mar 2021 18:43:44 +0800
Subject: [PATCH 16/45] [CustomOp]

---
 cmake/inference_lib.cmake                     |   6 +
 paddle/fluid/extension/include/ext_dispatch.h |  65 +++++++++
 paddle/fluid/extension/include/ext_dtype.h    |  31 ++--
 paddle/fluid/extension/src/ext_tensor.cc      |  34 +++++
 paddle/fluid/framework/CMakeLists.txt         |   7 +-
 paddle/fluid/framework/custom_operator.cc     |  35 ++++-
 paddle/fluid/framework/custom_tensor_test.cc  |  22 +++
 paddle/fluid/framework/custom_tensor_utils.h  |   8 ++
 paddle/fluid/inference/CMakeLists.txt         |   4 +
 paddle/fluid/pybind/CMakeLists.txt            |   4 +
 .../fluid/tests/custom_op/CMakeLists.txt      |   3 +
 .../fluid/tests/custom_op/custom_conj_op.cc   |  94 ++++++++++++
 .../fluid/tests/custom_op/dispatch_test_op.cc |  56 ++++++++
 .../fluid/tests/custom_op/test_custom_conj.py | 136 ++++++++++++++++++
 .../tests/custom_op/test_dispatch_jit.py      |  20 +++
 python/setup.py.in                            |  20 ++-
 16 files changed, 530 insertions(+), 15 deletions(-)
 create mode 100644 python/paddle/fluid/tests/custom_op/custom_conj_op.cc
 create mode 100644 python/paddle/fluid/tests/custom_op/test_custom_conj.py

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 2cba3d0693..570b37ff11 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -192,6 +192,12 @@ include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
 copy(inference_lib_dist
         SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/*
         DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
+copy(inference_lib_dist
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
+copy(inference_lib_dist
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
 
 # CAPI inference library for only inference
 set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
diff --git a/paddle/fluid/extension/include/ext_dispatch.h b/paddle/fluid/extension/include/ext_dispatch.h
index eed7360464..7b3893e283 100644
--- a/paddle/fluid/extension/include/ext_dispatch.h
+++ b/paddle/fluid/extension/include/ext_dispatch.h
@@ -68,6 +68,22 @@ namespace paddle {
     }                                                                         \
   }()
 
+///////// Complex Dispatch Marco ///////////
+
+#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...)                         \
+  [&] {                                                                    \
+    const auto& __dtype__ = TYPE;                                          \
+    switch (__dtype__) {                                                   \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64,            \
+                           ::paddle::complex64, __VA_ARGS__)               \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128,           \
+                           ::paddle::complex128, __VA_ARGS__)              \
+      default:                                                             \
+        PD_THROW("function " #NAME " is not implemented for data type `" + \
+                 ::paddle::ToString(__dtype__) + "`");                     \
+    }                                                                      \
+  }()
+
 ///////// Floating and Integral Dispatch Marco ///////////
 
 #define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...)              \
@@ -93,6 +109,55 @@ namespace paddle {
     }                                                                         \
   }()
 
+///////// Floating and Complex Dispatch Marco ///////////
+
+#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...)            \
+  [&] {                                                                    \
+    const auto& __dtype__ = TYPE;                                          \
+    switch (__dtype__) {                                                   \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,       \
+                           __VA_ARGS__)                                    \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,      \
+                           __VA_ARGS__)                                    \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64,            \
+                           ::paddle::complex64, __VA_ARGS__)               \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128,           \
+                           ::paddle::complex128, __VA_ARGS__)              \
+      default:                                                             \
+        PD_THROW("function " #NAME " is not implemented for data type `" + \
+                 ::paddle::ToString(__dtype__) + "`");                     \
+    }                                                                      \
+  }()
+
+///////// Floating, Integral and Complex Dispatch Marco ///////////
+
+#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...)  \
+  [&] {                                                                       \
+    const auto& __dtype__ = TYPE;                                             \
+    switch (__dtype__) {                                                      \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,          \
+                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,         \
+                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t,          \
+                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t,            \
+                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t,          \
+                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t,          \
+                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64,               \
+                           ::paddle::complex64, __VA_ARGS__)                  \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128,              \
+                           ::paddle::complex128, __VA_ARGS__)                 \
+      default:                                                                \
+        PD_THROW("function " #NAME " is not implemented for data type `" +    \
+                 ::paddle::ToString(__dtype__) + "`");                        \
+    }                                                                         \
+  }()
+
 // TODO(chenweihang): Add more Marcos in the future if needed
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/include/ext_dtype.h b/paddle/fluid/extension/include/ext_dtype.h
index 46c4bac236..a1e58fbacd 100644
--- a/paddle/fluid/extension/include/ext_dtype.h
+++ b/paddle/fluid/extension/include/ext_dtype.h
@@ -16,10 +16,15 @@ limitations under the License. */
 #include <cstdint>
 #include <string>
 
+#include "complex128.h"     // NOLINT
+#include "complex64.h"      // NOLINT
 #include "ext_exception.h"  // NOLINT
 
 namespace paddle {
 
+using complex64 = paddle::platform::complex64;
+using complex128 = paddle::platform::complex128;
+
 enum class DataType {
   BOOL,
   INT8,
@@ -29,6 +34,8 @@ enum class DataType {
   INT64,
   FLOAT32,
   FLOAT64,
+  COMPLEX64,
+  COMPLEX128,
   // TODO(JiabinYang) support more data types if needed.
 };
 
@@ -50,20 +57,26 @@ inline std::string ToString(DataType dtype) {
       return "float";
     case DataType::FLOAT64:
       return "double";
+    case DataType::COMPLEX64:
+      return "complex64";
+    case DataType::COMPLEX128:
+      return "complex128";
     default:
       PD_THROW("Unsupported paddle enum data type.");
   }
 }
 
-#define PD_FOR_EACH_DATA_TYPE(_) \
-  _(bool, DataType::BOOL)        \
-  _(int8_t, DataType::INT8)      \
-  _(uint8_t, DataType::UINT8)    \
-  _(int16_t, DataType::INT16)    \
-  _(int, DataType::INT32)        \
-  _(int64_t, DataType::INT64)    \
-  _(float, DataType::FLOAT32)    \
-  _(double, DataType::FLOAT64)
+#define PD_FOR_EACH_DATA_TYPE(_)    \
+  _(bool, DataType::BOOL)           \
+  _(int8_t, DataType::INT8)         \
+  _(uint8_t, DataType::UINT8)       \
+  _(int16_t, DataType::INT16)       \
+  _(int, DataType::INT32)           \
+  _(int64_t, DataType::INT64)       \
+  _(float, DataType::FLOAT32)       \
+  _(double, DataType::FLOAT64)      \
+  _(complex64, DataType::COMPLEX64) \
+  _(complex128, DataType::COMPLEX128)
 
 template <paddle::DataType T>
 struct DataTypeToCPPType;
diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc
index 4434a3bf59..cb37bf180c 100644
--- a/paddle/fluid/extension/src/ext_tensor.cc
+++ b/paddle/fluid/extension/src/ext_tensor.cc
@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/extension/include/ext_tensor.h"
+
 #include <utility>
+
 #include "paddle/fluid/framework/custom_tensor_utils.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/complex128.h"
+#include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/transform.h"
 
@@ -162,6 +166,10 @@ DataType Tensor::type() const {
     return DataType::FLOAT64;
   } else if (type == framework::proto::VarType::BOOL) {
     return DataType::BOOL;
+  } else if (type == framework::proto::VarType::COMPLEX64) {
+    return DataType::COMPLEX64;
+  } else if (type == framework::proto::VarType::COMPLEX128) {
+    return DataType::COMPLEX128;
   }
   // TODO(JiabinYang) Support more dtype here
   return DataType::FLOAT32;
@@ -217,6 +225,10 @@ template PD_DLL_DECL Tensor
 Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
 template PD_DLL_DECL Tensor
 Tensor::copy_to<bool>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>(
+    const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>(
+    const PlaceType &target_place) const;
 
 template PD_DLL_DECL float *Tensor::data<float>() const;
 template PD_DLL_DECL double *Tensor::data<double>() const;
@@ -226,6 +238,10 @@ template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const;
 template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const;
 template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const;
 template PD_DLL_DECL bool *Tensor::data<bool>() const;
+template PD_DLL_DECL paddle::platform::complex64 *
+Tensor::data<paddle::platform::complex64>() const;
+template PD_DLL_DECL paddle::platform::complex128 *
+Tensor::data<paddle::platform::complex128>() const;
 
 template PD_DLL_DECL float *Tensor::mutable_data<float>();
 template PD_DLL_DECL double *Tensor::mutable_data<double>();
@@ -235,6 +251,10 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
 template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
 template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
 template PD_DLL_DECL bool *Tensor::mutable_data<bool>();
+template PD_DLL_DECL paddle::platform::complex64 *
+Tensor::mutable_data<paddle::platform::complex64>();
+template PD_DLL_DECL paddle::platform::complex128 *
+Tensor::mutable_data<paddle::platform::complex128>();
 
 template PD_DLL_DECL float *Tensor::mutable_data<float>(const PlaceType &place);
 template PD_DLL_DECL double *Tensor::mutable_data<double>(
@@ -250,6 +270,10 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
 template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
     const PlaceType &place);
 template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);
+template PD_DLL_DECL paddle::platform::complex64 *
+Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
+template PD_DLL_DECL paddle::platform::complex128 *
+Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
 
 std::vector<int64_t> Tensor::shape() const {
   GET_CASTED_TENSOR
@@ -310,6 +334,16 @@ Tensor Tensor::cast(const DataType &target_type) const {
       framework::VisitDataType(
           dst_type, CastDataType<uint8_t>(*tensor, rlt_tensor_, ctx));
       break;
+    case framework::proto::VarType::COMPLEX64:
+      framework::VisitDataType(
+          dst_type,
+          CastDataType<paddle::platform::complex64>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::COMPLEX128:
+      framework::VisitDataType(dst_type,
+                               CastDataType<paddle::platform::complex128>(
+                                   *tensor, rlt_tensor_, ctx));
+      break;
     // TODO(JiabinYang) Support more dtype here
     default:
       PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 43bbc06787..1fa4ce9b57 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -346,13 +346,16 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
 
 configure_file(commit.h.in commit.h)
 
+# Adapt to custom op mechanism: Include the header files related to the data type
+# to avoid exposing the path of the underlying file
+include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include)
+
 cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce)
 cc_library(op_meta_info SRCS ../extension/src/ext_op_meta_info.cc DEPS custom_tensor)
 cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper custom_tensor op_meta_info)
 cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog)
 
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include)
-
 set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 0baacd4621..69a9be603e 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -757,10 +757,39 @@ void RegisterOperatorWithMetaInfo(
       return new CustomOperator(type, inputs, outputs, attrs);
     };
 
-    // Grad InferShape (gradient's shape is same with forward input default)
-    grad_info.infer_shape_ = [grad_op_outputs](InferShapeContext* ctx) {
+    // Grad InferShape
+    grad_info.infer_shape_ = [grad_op_inputs,
+                              grad_op_outputs](InferShapeContext* ctx) {
+      // 1. if forward input exists, gradient's shape is same with forward input
+      // default
+      //    [Suitable for most situations]
+      // 2. if forward input not exists, and only contains one grad input and
+      // output,
+      //    use grad input shape as grad output shape
+      //    [Suitable for the situation that forward input is not used as
+      //    backward input]
+      // TODO(chenweihang): support set grad op infershape func if needed
       for (auto& out_name : grad_op_outputs) {
-        ctx->ShareDim(detail::NoGrad(out_name), out_name);
+        auto fwd_name = detail::NoGrad(out_name);
+        if (detail::IsDuplicableVar(fwd_name)) {
+          // Duplicable forward var must as backward input
+          ctx->ShareDim(fwd_name, out_name);
+        } else {
+          if (ctx->HasInput(fwd_name)) {
+            ctx->ShareDim(fwd_name, out_name);
+          } else {
+            PADDLE_ENFORCE_EQ(
+                grad_op_inputs.size() == 1UL && grad_op_outputs.size() == 1UL,
+                true,
+                platform::errors::Unavailable(
+                    "Custom grad operator infershape error. "
+                    "If a custom grad operator contains only one input and "
+                    "only one output, the input shape will be directly set to "
+                    "the output shape. Otherwise, Please set the forward input "
+                    "as the grad operator's input."));
+            ctx->ShareDim(grad_op_inputs[0], out_name);
+          }
+        }
       }
     };
 
diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 2e42248f64..7da5658860 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -109,6 +109,10 @@ void GroupTestCopy() {
   TestCopyTensor<int8_t>();
   VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor<uint8_t>();
+  VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<paddle::complex64>();
+  VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<paddle::complex128>();
 }
 
 void GroupTestCast() {
@@ -126,6 +130,10 @@ void GroupTestCast() {
   TestCast<uint8_t>(paddle::DataType::FLOAT32);
   VLOG(2) << "float cast";
   TestCast<float>(paddle::DataType::FLOAT32);
+  VLOG(2) << "complex64 cast";
+  TestCast<paddle::complex64>(paddle::DataType::FLOAT32);
+  VLOG(2) << "complex128 cast";
+  TestCast<paddle::complex128>(paddle::DataType::FLOAT32);
 }
 
 void GroupTestDtype() {
@@ -136,6 +144,8 @@ void GroupTestDtype() {
   CHECK(TestDtype<int16_t>() == paddle::DataType::INT16);
   CHECK(TestDtype<int8_t>() == paddle::DataType::INT8);
   CHECK(TestDtype<uint8_t>() == paddle::DataType::UINT8);
+  CHECK(TestDtype<paddle::complex64>() == paddle::DataType::COMPLEX64);
+  CHECK(TestDtype<paddle::complex128>() == paddle::DataType::COMPLEX128);
 }
 
 void GroupTestDtypeConvert() {
@@ -162,6 +172,12 @@ void GroupTestDtypeConvert() {
         paddle::framework::proto::VarType::INT16);
   CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
             paddle::DataType::BOOL) == paddle::framework::proto::VarType::BOOL);
+  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
+            paddle::DataType::COMPLEX64) ==
+        paddle::framework::proto::VarType::COMPLEX64);
+  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
+            paddle::DataType::COMPLEX128) ==
+        paddle::framework::proto::VarType::COMPLEX128);
   // proto -> enum
   CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
             paddle::framework::proto::VarType::FP64) ==
@@ -185,6 +201,12 @@ void GroupTestDtypeConvert() {
         paddle::DataType::INT16);
   CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
             paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL);
+  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
+            paddle::framework::proto::VarType::COMPLEX64) ==
+        paddle::DataType::COMPLEX64);
+  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
+            paddle::framework::proto::VarType::COMPLEX128) ==
+        paddle::DataType::COMPLEX128);
 }
 
 TEST(CustomTensor, copyTest) {
diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h
index 919a3a1a49..a252d6aef4 100644
--- a/paddle/fluid/framework/custom_tensor_utils.h
+++ b/paddle/fluid/framework/custom_tensor_utils.h
@@ -56,6 +56,10 @@ class CustomTensorUtils {
         return framework::proto::VarType::INT64;
       case paddle::DataType::INT16:
         return framework::proto::VarType::INT16;
+      case paddle::DataType::COMPLEX64:
+        return framework::proto::VarType::COMPLEX64;
+      case paddle::DataType::COMPLEX128:
+        return framework::proto::VarType::COMPLEX128;
       case paddle::DataType::BOOL:
         return framework::proto::VarType::BOOL;
       default:
@@ -83,6 +87,10 @@ class CustomTensorUtils {
         return paddle::DataType::UINT8;
       case framework::proto::VarType::INT16:
         return paddle::DataType::INT16;
+      case framework::proto::VarType::COMPLEX64:
+        return paddle::DataType::COMPLEX64;
+      case framework::proto::VarType::COMPLEX128:
+        return paddle::DataType::COMPLEX128;
       case framework::proto::VarType::BOOL:
         return paddle::DataType::BOOL;
       default:
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 7a8bfc1a8c..93fd85f13c 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -36,6 +36,10 @@ endif()
 # fluid_modules exclude API-interface of inference/api and inference/capi
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 
+# Adapt to custom op mechanism: Include the header files related to the data type
+# to avoid exposing the path of the underlying file
+include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
+
 add_subdirectory(api)
 
 # Create static inference library if needed
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 7a63217d67..5452b2160a 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,3 +1,7 @@
+# Adapt to custom op mechanism: Include the header files related to the data type
+# to avoid exposing the path of the underlying file
+include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
+
 set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
   feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
index 620bff11a2..4ba537930c 100644
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -26,6 +26,9 @@ set_tests_properties(test_custom_attrs_jit PROPERTIES TIMEOUT 120)
 py_test(test_custom_concat SRCS test_custom_concat.py)
 set_tests_properties(test_custom_concat PROPERTIES TIMEOUT 120)
 
+py_test(test_custom_conj SRCS test_custom_conj.py)
+set_tests_properties(test_custom_conj PROPERTIES TIMEOUT 120)
+
 py_test(test_check_abi SRCS test_check_abi.py)
 
 cc_test(test_check_error SRCS test_check_error.cc DEPS gtest)
diff --git a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
new file mode 100644
index 0000000000..4feb887ca0
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WIdata_tHOUdata_t WARRANdata_tIES OR CONDIdata_tIONS OF ANY KIND, either
+// express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <vector>
+
+#include "paddle/extension.h"
+
+#define CHECK_INPUT(x) \
+  PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
+
+template <typename data_t>
+using EnableComplex = typename std::enable_if<
+    std::is_same<data_t, paddle::complex64>::value ||
+    std::is_same<data_t, paddle::complex128>::value>::type;
+
+template <typename data_t>
+using DisableComplex = typename std::enable_if<
+    !std::is_same<data_t, paddle::complex64>::value &&
+    !std::is_same<data_t, paddle::complex128>::value>::type;
+
+template <typename data_t, typename Enable = void>
+struct ConjFunctor;
+
+template <typename data_t>
+struct ConjFunctor<data_t, EnableComplex<data_t>> {
+  ConjFunctor(const data_t* input, int64_t numel, data_t* output)
+      : input_(input), numel_(numel), output_(output) {}
+
+  void operator()(size_t idx) const {
+    output_[idx] = data_t(input_[idx].real, -input_[idx].imag);
+  }
+
+  const data_t* input_;
+  int64_t numel_;
+  data_t* output_;
+};
+
+template <typename data_t>
+struct ConjFunctor<data_t, DisableComplex<data_t>> {
+  ConjFunctor(const data_t* input, int64_t numel, data_t* output)
+      : input_(input), numel_(numel), output_(output) {}
+
+  void operator()(size_t idx) const { output_[idx] = input_[idx]; }
+
+  const data_t* input_;
+  int64_t numel_;
+  data_t* output_;
+};
+
+template <typename data_t>
+void ConjCPUKernel(const data_t* x_data, int64_t numel, data_t* out_data) {
+  ConjFunctor<data_t> conj(x_data, numel, out_data);
+  for (int64_t i = 0; i < numel; ++i) {
+    conj(i);
+  }
+}
+
+std::vector<paddle::Tensor> ConjFunction(const paddle::Tensor& x) {
+  CHECK_INPUT(x);
+
+  paddle::Tensor out(x.place());
+  out.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
+      x.type(), "ConjCPUKernel", ([&] {
+        ConjCPUKernel<data_t>(
+            x.data<data_t>(), x.size(), out.mutable_data<data_t>());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP(custom_conj)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(ConjFunction));
+
+PD_BUILD_GRAD_OP(custom_conj)
+    .Inputs({paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(ConjFunction));
diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
index 33ca6ee86f..fbf5442ac0 100644
--- a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
+++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
@@ -62,3 +62,59 @@ PD_BUILD_OP(dispatch_test_float_and_integer)
     .Inputs({"X"})
     .Outputs({"Out"})
     .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger));
+
+std::vector<paddle::Tensor> DispatchTestComplex(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_COMPLEX_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP(dispatch_test_complex)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestComplex));
+
+std::vector<paddle::Tensor> DispatchTestFloatAndComplex(
+    const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP(dispatch_test_float_and_complex)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestFloatAndComplex));
+
+std::vector<paddle::Tensor> DispatchTestFloatAndIntegerAndComplex(
+    const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP(dispatch_test_float_and_integer_and_complex)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex));
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_conj.py b/python/paddle/fluid/tests/custom_op/test_custom_conj.py
new file mode 100644
index 0000000000..3a8f79a06f
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/test_custom_conj.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+
+import paddle
+import paddle.static as static
+from paddle.utils.cpp_extension import load, get_build_directory
+from paddle.utils.cpp_extension.extension_utils import run_cmd
+from utils import paddle_includes, extra_cc_args, extra_nvcc_args
+
+# Because Windows don't use docker, the shared lib already exists in the
+# cache dir, it will not be compiled again unless the shared lib is removed.
+file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format(
+    get_build_directory())
+if os.name == 'nt' and os.path.isfile(file):
+    cmd = 'del {}'.format(file)
+    run_cmd(cmd, True)
+
+custom_ops = load(
+    name='custom_conj_jit',
+    sources=['custom_conj_op.cc'],
+    extra_include_paths=paddle_includes,  # add for Coverage CI
+    extra_cxx_cflags=extra_cc_args,  # test for cc flags
+    extra_cuda_cflags=extra_nvcc_args,  # test for nvcc flags
+    verbose=True)
+
+
+def is_complex(dtype):
+    return dtype == paddle.fluid.core.VarDesc.VarType.COMPLEX64 or \
+      dtype == paddle.fluid.core.VarDesc.VarType.COMPLEX128
+
+
+def to_complex(dtype):
+    if dtype == "float32":
+        return np.complex64
+    elif dtype == "float64":
+        return np.complex128
+    else:
+        return dtype
+
+
+def conj_dynamic(func, dtype, np_input):
+    paddle.set_device("cpu")
+    x = paddle.to_tensor(np_input)
+    out = func(x)
+    out.stop_gradient = False
+    sum_out = paddle.sum(out)
+    if is_complex(sum_out.dtype):
+        sum_out.real().backward()
+    else:
+        sum_out.backward()
+    return out.numpy(), x.grad
+
+
+def conj_static(func, shape, dtype, np_input):
+    paddle.enable_static()
+    paddle.set_device("cpu")
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="x", shape=shape, dtype=dtype)
+            x.stop_gradient = False
+            out = func(x)
+            sum_out = paddle.sum(out)
+            static.append_backward(sum_out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+
+            out_v, x_grad_v = exe.run(static.default_main_program(),
+                                      feed={"x": np_input},
+                                      fetch_list=[out.name, x.name + "@GRAD"])
+    paddle.disable_static()
+    return out_v, x_grad_v
+
+
+class TestCustomConjJit(unittest.TestCase):
+    def setUp(self):
+        self.dtypes = ['float32', 'float64']
+        self.shape = [2, 20, 2, 3]
+
+    def check_output(self, out, pd_out, name):
+        self.assertTrue(
+            np.array_equal(out, pd_out),
+            "custom op {}: {},\n paddle api {}: {}".format(name, out, name,
+                                                           pd_out))
+
+    def run_dynamic(self, dtype, np_input):
+        out, x_grad = conj_dynamic(custom_ops.custom_conj, dtype, np_input)
+        pd_out, pd_x_grad = conj_dynamic(paddle.conj, dtype, np_input)
+
+        self.check_output(out, pd_out, "out")
+        self.check_output(x_grad, pd_x_grad, "x's grad")
+
+    def run_static(self, dtype, np_input):
+        out, x_grad = conj_static(custom_ops.custom_conj, self.shape, dtype,
+                                  np_input)
+        pd_out, pd_x_grad = conj_static(paddle.conj, self.shape, dtype,
+                                        np_input)
+
+        self.check_output(out, pd_out, "out")
+        self.check_output(x_grad, pd_x_grad, "x's grad")
+
+    def test_dynamic(self):
+        for dtype in self.dtypes:
+            np_input = np.random.random(self.shape).astype(dtype)
+            self.run_dynamic(dtype, np_input)
+
+    def test_static(self):
+        for dtype in self.dtypes:
+            np_input = np.random.random(self.shape).astype(dtype)
+            self.run_static(dtype, np_input)
+
+    # complex only used in dynamic mode now
+    def test_complex_dynamic(self):
+        for dtype in self.dtypes:
+            np_input = np.random.random(self.shape).astype(
+                dtype) + 1j * np.random.random(self.shape).astype(dtype)
+            self.run_dynamic(to_complex(dtype), np_input)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
index 6cdbc61620..bc36372c6a 100644
--- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
@@ -55,6 +55,11 @@ class TestJitDispatch(unittest.TestCase):
         for dtype in dtypes:
             self.run_dispatch_test(dispatch_op.dispatch_test_integer, dtype)
 
+    def test_dispatch_complex(self):
+        dtypes = ["complex64", "complex128"]
+        for dtype in dtypes:
+            self.run_dispatch_test(dispatch_op.dispatch_test_complex, dtype)
+
     def test_dispatch_float_and_integer(self):
         dtypes = [
             "float32", "float64", "int32", "int64", "int8", "uint8", "int16"
@@ -63,6 +68,21 @@ class TestJitDispatch(unittest.TestCase):
             self.run_dispatch_test(dispatch_op.dispatch_test_float_and_integer,
                                    dtype)
 
+    def test_dispatch_float_and_complex(self):
+        dtypes = ["float32", "float64", "complex64", "complex128"]
+        for dtype in dtypes:
+            self.run_dispatch_test(dispatch_op.dispatch_test_float_and_complex,
+                                   dtype)
+
+    def test_dispatch_float_and_integer_and_complex(self):
+        dtypes = [
+            "float32", "float64", "int32", "int64", "int8", "uint8", "int16",
+            "complex64", "complex128"
+        ]
+        for dtype in dtypes:
+            self.run_dispatch_test(
+                dispatch_op.dispatch_test_float_and_integer_and_complex, dtype)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 0e214c5c65..0afc3956a0 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -451,12 +451,30 @@ class InstallHeaders(Command):
                                    ('install_headers', 'install_dir'),
                                    ('force', 'force'))
 
+    def copy_data_type_headers(self, header):
+        if os.name == 'nt':
+            data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h']
+        else:
+            data_type_headers = ['platform/complex64.h', 'platform/complex128.h']
+        for dtype_header in data_type_headers:
+            if dtype_header in header:
+                if os.name == 'nt':
+                    install_dir = os.path.join(self.install_dir, "paddle\\fluid\\extension\\include")
+                else:
+                    install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include")
+                if not os.path.exists(install_dir):
+                    self.mkpath(install_dir)
+                return self.copy_file(header, install_dir)
+
     def mkdir_and_copy_file(self, header):
         if 'pb.h' in header:
             install_dir = re.sub('${PADDLE_BINARY_DIR}/', '', header)
         elif 'third_party' not in header:
-            # framework
+            # paddle headers
             install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
+            # For paddle data type headers, we also need to copy to `extension/incude`,
+            # used for new custom operator
+            self.copy_data_type_headers(header)
         else:
             # third_party
             install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
-- 
Gitee


From 3451676841c2805e9a52f02872cf457511eaa669 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:31:02 +0800
Subject: [PATCH 17/45] =?UTF-8?q?!24=20[ROCM]=20fix=20layer=5Fnorm,=20norm?=
 =?UTF-8?q?,=20p=5Fnorm,=20test=5Fsequence=5Fsoftmax=5Fop,=20test=5Fm?=
 =?UTF-8?q?=E2=80=A6=20Merge=20pull=20request=20!24=20from=20PaddlePaddle-?=
 =?UTF-8?q?Gardener/bugfix=5Frocm=5F2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddle/fluid/operators/layer_norm_op.cu                |  9 ++++++++-
 paddle/fluid/operators/norm_op.cu                      |  9 ++++++++-
 paddle/fluid/operators/p_norm_op.cu                    | 10 ++++++++++
 .../unittests/sequence/test_sequence_softmax_op.py     |  6 +++---
 .../tests/unittests/test_math_op_patch_var_base.py     |  7 +++++--
 5 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu
index d0f7dca98a..3656de3525 100644
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -43,7 +43,11 @@ template <typename T>
 using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
 inline static int GetDesiredBlockDim(int block_dim) {
+#ifdef __HIPCC__
+  const int kMaxBlockDim = 256;
+#else
   const int kMaxBlockDim = 512;
+#endif
   return block_dim >= kMaxBlockDim
              ? kMaxBlockDim
              : (1 << (static_cast<int>(std::log2f(block_dim))));
@@ -698,8 +702,11 @@ static void LayerNormBackward(const T *x, const T *d_y, const U *scale,
                               const framework::ExecutionContext &ctx) {
   auto &dev_ctx = ctx.cuda_device_context();
   auto stream = dev_ctx.stream();
-
+#ifdef __HIPCC__
+  const int kMaxBlockDim = 256;
+#else
   const int kMaxBlockDim = 512;
+#endif
   const int kMaxBlockNum = 128;
   int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
                       ((d_scale != nullptr ? 1 : 0) << 1) |
diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu
index 6b5c70c925..4c1674ded1 100644
--- a/paddle/fluid/operators/norm_op.cu
+++ b/paddle/fluid/operators/norm_op.cu
@@ -79,8 +79,11 @@ class NormCUDAKernel : public framework::OpKernel<T> {
     GetDims(xdim, axis, &pre, &n, &post);
 
     auto& dev_ctx = ctx.cuda_device_context();
-
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
@@ -146,7 +149,11 @@ class NormGradCUDAKernel : public framework::OpKernel<T> {
 
     auto& dev_ctx = ctx.cuda_device_context();
 
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu
index 918f0bb1e4..bd6694abdb 100644
--- a/paddle/fluid/operators/p_norm_op.cu
+++ b/paddle/fluid/operators/p_norm_op.cu
@@ -142,7 +142,12 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
 
     auto& dev_ctx = ctx.cuda_device_context();
 
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
+
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
@@ -244,7 +249,12 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
 
     auto& dev_ctx = ctx.cuda_device_context();
 
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
+
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py
index 92146820da..cb92a68bde 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py
@@ -28,10 +28,10 @@ class TestSequenceSoftmaxOp(OpTest):
         self.op_type = "sequence_softmax"
         self.use_cudnn = False
         self.init_op_type()
-
-        x = np.random.uniform(0.1, 1, (110, 1)).astype("float64")
+        self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
+        x = np.random.uniform(0.1, 1, (110, 1)).astype(self.dtype)
         self.init_lod()
-        out = np.zeros((110, 1)).astype("float64")
+        out = np.zeros((110, 1)).astype(self.dtype)
         offset = 0
         for i in range(len(self.lod[0])):
             if (self.lod[0][i] == 0):
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index e908f1a60a..4b097f6359 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -354,8 +354,11 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
                               [1.30058, 1.0688717, 1.4928783],
                               [1.0958099, 1.3724753, 1.8926544]])
         d = d.matmul(d.t())
-        self.assertTrue(
-            np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy()))
+        # ROCM not support cholesky
+        if not fluid.core.is_compiled_with_rocm():
+            self.assertTrue(
+                np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy(
+                )))
 
         self.assertTrue(
             np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
-- 
Gitee


From 18c972d68346a65edfa9d4e73a43fa1518d307ff Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:31:04 +0800
Subject: [PATCH 18/45] !25 [dgraph qat] Refine calculating output scale of
 dygraph qat Merge pull request !25 from
 PaddlePaddle-Gardener/refine_calc_output_scale

---
 .../slim/quantization/imperative/qat.py       | 221 +++++++++---------
 .../slim/quantization/imperative/quant_nn.py  |   4 +
 .../slim/quantization/imperative/utils.py     |  43 ++--
 .../test_imperative_qat_addquantdequant.py    |   4 +-
 4 files changed, 138 insertions(+), 134 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 04aec158ea..abfe06a332 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -25,12 +25,7 @@ from paddle.fluid.executor import Executor
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.initializer import Constant
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
-from paddle.nn import Linear, Conv2D, Conv2DTranspose, MaxPool2D, MaxPool1D
-from paddle.nn import BatchNorm1D, BatchNorm2D, BatchNorm3D, SyncBatchNorm
-from paddle.fluid.dygraph.nn import BatchNorm, Pool2D
 from paddle.fluid.io import load_inference_model, save_inference_model
-from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6
-from paddle.nn.layer.activation import Tanh, Softmax, PReLU, Swish
 from paddle.fluid.log_helper import get_logger
 from . import quant_nn
 from .. import quantization_pass
@@ -62,14 +57,10 @@ class ImperativeQuantAware(object):
         The constructor for ImperativeQuantAware.
 
         Args:
-            quantizable_layer_type(list[str]): List the type of layers that
-                will be quantized. Default is ['Conv2D', 'Linear'].
-                The quantizable_op_type in QuantizationFreezePass and
-                ConvertToInt8Pass must be the same as this.
+            quantizable_layer_type(list[str | layer]): List the type of
+                layers that will be quantized. Default is ['Conv2D', 'Linear'].
             weight_quantize_type(str): quantization type for weights,
-                which supports 'abs_max' now. The 'moving_average_abs_max'
-                usually is not used for weights, since weights are fixed
-                once the model is well trained.
+                which supports 'abs_max' and 'channel_wise_abs_max'.
             activation_quantize_type(str): quantization type for activations,
                 which supports 'abs_max' and 'moving_average_abs_max' now.
                 If using 'abs_max' mode, the quantization scale will be
@@ -77,8 +68,8 @@ class ImperativeQuantAware(object):
                 period. If using 'moving_average_abs_max', the static
                 quantization scale will be calculated during training and
                 used in inference.
-            weight_bits(int): quantization bit number for weights,
-                whereas the bias is not quantized.
+            weight_bits(int): quantization bit number for weights, whereas
+                the bias is not quantized.
             activation_bits(int): quantization bit number for activations.
             moving_rate(float): the parameter for 'moving_average_abs_max'
                 quantization.
@@ -260,8 +251,8 @@ class ImperativeQuantizeInputs(object):
         super(ImperativeQuantizeInputs, self).__init__()
 
         self._quantizable_layer_type = tuple(
-            utils._quant_layers_map[layer]
-            if layer in utils._quant_layers_map else layer
+            utils.supported_quant_layers_map[layer]
+            if layer in utils.supported_quant_layers_map else layer
             for layer in quantizable_layer_type)
         for layer in self._quantizable_layer_type:
             assert not isinstance(layer, str), \
@@ -338,7 +329,7 @@ class ImperativeQuantizeInputs(object):
 
     def _get_quantized_layer(self, layer):
         quant_layer_name = None
-        for key, value in utils._quant_layers_map.items():
+        for key, value in utils.supported_quant_layers_map.items():
             if isinstance(layer, value):
                 quant_layer_name = 'Quantized' + key
                 break
@@ -364,10 +355,6 @@ class ImperativeCalcOutputScale(object):
         """
         super(ImperativeCalcOutputScale, self).__init__()
         self._moving_rate = moving_rate
-        self._out_scale_layer_type_list = (
-            BatchNorm, BatchNorm1D, BatchNorm2D, BatchNorm3D, Conv2D, LeakyReLU,
-            Linear, PReLU, Pool2D, MaxPool1D, MaxPool2D, ReLU, ReLU6, Sigmoid,
-            Softmax, SyncBatchNorm, Tanh, Swish)
         self._register_hook_handle_list = []
         self._out_scale_dict = collections.OrderedDict()
 
@@ -378,7 +365,7 @@ class ImperativeCalcOutputScale(object):
 
         Args:
             model(fluid.dygraph.Layer): The target model which would be
-            calculate the output quantization scale.
+                calculate the output quantization scale.
 
         Returns:
             None
@@ -387,10 +374,10 @@ class ImperativeCalcOutputScale(object):
             "The model must be the instance of dygraph.Layer."
         for _, layer in model.named_sublayers():
             if self._is_target_layer(layer):
-                self._add_new_parameters(layer)
-                forward_post_hook_handle = layer.register_forward_post_hook(
-                    self._forward_post_hook)
-                self._register_hook_handle_list.append(forward_post_hook_handle)
+                self._init_scale_params(layer)
+                hook_handle = layer.register_forward_post_hook(
+                    self._calc_output_scale_hook)
+                self._register_hook_handle_list.append(hook_handle)
 
     def save_quantized_model(self, layer, path, input_spec=None, **config):
         """
@@ -398,63 +385,64 @@ class ImperativeCalcOutputScale(object):
 
         Args:
             layer (Layer): The Layer to be saved.
-            path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``.
-            input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward 
-                method, which can be described by InputSpec or example Tensor. If None, all input variables of 
-                the original Layer's forward method would be the inputs of the saved model. Default None.
-            **configs (dict, optional): Other save configuration options for compatibility. We do not 
-                recommend using these configurations, they may be removed in the future. If not necessary, 
-                DO NOT use them. Default None.
+            path (str): The path prefix to save model. The format is 
+                ``dirname/file_prefix`` or ``file_prefix``.
+            input_spec (list[InputSpec|Tensor], optional): Describes the input
+                of the saved model's forward method, which can be described by
+                InputSpec or example Tensor. If None, all input variables of 
+                the original Layer's forward method would be the inputs of
+                the saved model. Default None.
+            **configs (dict, optional): Other save configuration options for
+                compatibility. We do not recommend using these configurations,
+                they may be removed in the future. If not necessary, DO NOT use
+                them. Default None.
                 The following options are currently supported:
-                (1) output_spec (list[Tensor]): Selects the output targets of the saved model.
-                By default, all return variables of original Layer's forward method are kept as the 
-                output of the saved model. If the provided ``output_spec`` list is not all output variables, 
-                the saved model will be pruned according to the given ``output_spec`` list. 
+                (1) output_spec (list[Tensor]): Selects the output targets of
+                the saved model. By default, all return variables of original
+                Layer's forward method are kept as the output of the saved model.
+                If the provided ``output_spec`` list is not all output variables, 
+                the saved model will be pruned according to the given
+                ``output_spec`` list. 
 
         Returns:
             None
         """
 
-        assert isinstance(
-            layer, dygraph.Layer), "model must be the instance of dygraph.Layer"
-        self._layer = layer
-        is_dynamic_mode = False
+        assert isinstance(layer, dygraph.Layer), \
+            "The model must be the instance of dygraph.Layer."
+
+        # remove handles and collect output scales
         with dygraph.guard():
-            self._layer.eval()
-            if self._register_hook_handle_list is not None:
-                for handle in self._register_hook_handle_list:
-                    handle.remove()
-            if self._out_scale_dict:
-                for key in self._out_scale_dict:
-                    self._out_scale_dict[key] = float(self._out_scale_dict[key]
-                                                      .numpy())
-            else:
-                for _, sub_layer in self._layer.named_sublayers():
-                    if self._is_target_layer(sub_layer):
+            layer.eval()
+            for handle in self._register_hook_handle_list:
+                handle.remove()
+            for _, sub_layer in layer.named_sublayers():
+                if self._is_target_layer(sub_layer):
+                    if hasattr(sub_layer, "layer_name"):
+                        layer_name = sub_layer.layer_name
+                    else:
                         layer_name = sub_layer.full_name()
-                        if hasattr(sub_layer, "layer_name"):
-                            layer_name = sub_layer.layer_name
-                        if hasattr(sub_layer, "_quant_out_scale"):
-                            self._out_scale_dict[layer_name] = float(
-                                sub_layer._quant_out_scale)
+                    if hasattr(sub_layer, "_quant_out_scale"):
+                        self._out_scale_dict[layer_name] = float(
+                            sub_layer._quant_out_scale)
 
+        # save the quantized model that doesn't have output scales
+        paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config)
+
+        # load static model
+        is_dynamic_mode = False
         if paddle.in_dynamic_mode():
             is_dynamic_mode = True
             paddle.enable_static()
 
-        paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config)
-
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
+        place = core.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else core.CPUPlace()
         exe = Executor(place)
 
-        file_prefix = os.path.basename(path)
         dirname = os.path.dirname(path)
-        model_filename = file_prefix + INFER_MODEL_SUFFIX
-        params_filename = file_prefix + INFER_PARAMS_SUFFIX
-
+        basename = os.path.basename(path)
+        model_filename = basename + INFER_MODEL_SUFFIX
+        params_filename = basename + INFER_PARAMS_SUFFIX
         [inference_program, feed_target_names, fetch_targets] = (
             load_inference_model(
                 dirname=dirname,
@@ -462,14 +450,15 @@ class ImperativeCalcOutputScale(object):
                 model_filename=model_filename,
                 params_filename=params_filename))
 
+        # set output scales to the static model
         check_behind_op = False
         op_count = 0
         ops_list = [key for key, _ in self._out_scale_dict.items()]
         if len(ops_list) == 0:
             warnings.warn(
-                "Warning: No Layer of the model while to be saved contains the out_threshold attribute, "
-                "so the generated inference model would not contain the out_threshold."
-            )
+                "Warning: No Layer of the model while to be saved contains "
+                "the out_threshold attribute, so the generated inference "
+                "model would not contain the out_threshold.")
         else:
             # Because the Layer in dygraph may correspond to multiple ops
             # in static program after being saved. To ensure correctness,
@@ -481,11 +470,12 @@ class ImperativeCalcOutputScale(object):
             forward_op = None
             for block in inference_program.blocks:
                 for op in block.ops:
-                    if op.type in utils._op_real_in_out_name:
+                    if op.type in utils.op_real_in_out_name:
                         if op_count > len(ops_list):
                             warnings.warn(
-                                "The number of Layer which has out_threshold attribute should be bigger than the op in inference model"
-                            )
+                                "The number of Layer which has "
+                                "out_threshold attribute should be bigger than "
+                                "the op in inference model")
                             break
                         if check_behind_op:
                             check_behind_op = False
@@ -525,7 +515,7 @@ class ImperativeCalcOutputScale(object):
                                 self._out_scale_dict[ops_list[op_count]])
                             op_count += 1
 
-        # Save the processed program.
+        # save the final quantized model that has output scales
         save_inference_model(
             dirname=dirname,
             feeded_var_names=feed_target_names,
@@ -539,41 +529,40 @@ class ImperativeCalcOutputScale(object):
             paddle.disable_static()
 
     def _is_target_layer(self, layer):
-        return isinstance(layer, self._out_scale_layer_type_list) \
+        return isinstance(layer, utils.out_scale_layers_list) \
             or 'quantized_' in layer.full_name()
 
-    # When inferenc model is saved, the logic in hook would not be executed
-    # in program translation, so that some parameters can not created in
-    # __init__, which would cause the model to fail to save. Therefore, the
-    # parameters creation in the hook is advanced to be exected outside the hook.
-    def _add_new_parameters(self, layer, name=None):
+    def _init_scale_params(self, layer, name=None):
+        """
+        Init the scale params for calculating output scales and save them in the
+        target layer.
+        After the users define the dygraph model, the hooks for calculating output
+        scales will not execute immediately. If the users load the checkpoint now,
+        the scale params have not been created, so them cann't be loaded.
+        Therefore, define the scale params in the beginning.
+        """
+
+        def _create_param(in_layer, first_name, last_name, dtype):
+            prefix = '{}.{}'.format(first_name, last_name) \
+                if first_name else 'outscale.{}'.format(last_name)
+            attr = ParamAttr(
+                name=unique_name.generate(prefix),
+                initializer=Constant(1),
+                trainable=False)
+            param = in_layer.create_parameter(shape=[1], attr=attr, dtype=dtype)
+            return param
+
         dtype = layer._dtype if layer._dtype is not None else "float32"
         if dtype not in ["float32", "float64"]:
             return
-        scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
-        scale_name = unique_name.generate(scale_prefix)
-        scale_attr = ParamAttr(
-            name=scale_name, initializer=Constant(1), trainable=False)
-        layer._quant_out_scale = layer.create_parameter(
-            shape=[1], attr=scale_attr, dtype=dtype)
+
+        layer._quant_out_scale = _create_param(layer, name, "scale", dtype)
         layer._quant_out_scale.stop_gradient = True
 
-        state_prefix = "{}.state".format(name) if name else 'outscale.state'
-        state_attr = ParamAttr(
-            name=unique_name.generate(state_prefix),
-            initializer=Constant(1),
-            trainable=False)
-        layer._quant_out_state = layer.create_parameter(
-            shape=[1], attr=state_attr, dtype=dtype)
+        layer._quant_out_state = _create_param(layer, name, "state", dtype)
         layer._quant_out_state.stop_gradient = True
 
-        accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
-        accum_attr = ParamAttr(
-            name=unique_name.generate(accum_prefix),
-            initializer=Constant(1),
-            trainable=False)
-        layer._quant_out_accum = layer.create_parameter(
-            shape=[1], attr=accum_attr, dtype=dtype)
+        layer._quant_out_accum = _create_param(layer, name, "accum", dtype)
         layer._quant_out_accum.stop_gradient = True
 
     # Judge whether the op in program matches the Layer in dynamic model
@@ -598,20 +587,18 @@ class ImperativeCalcOutputScale(object):
             op_type = op_type.replace('relu', 're_lu')
         return op_type in layer_name
 
-    def _forward_post_hook(self, layer, input, output):
-        assert isinstance(
-            output, (core.VarBase, framework.Variable)
-        ), "Multiple outputs are not currently supported in ImperativeOutScale."
-        if output.dtype not in [
-                core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64
-        ]:
-            return
-        if not hasattr(layer, "_out_scale"):
-            self._out_scale = quant_nn.MovingAverageAbsMaxScale(
-                layer, output.name, self._moving_rate, output.dtype)
-        scale_out = self._out_scale(output)
-        if hasattr(layer, 'layer_name'):
-            layer_name = layer.layer_name
-        else:
-            layer_name = layer.full_name()
-        self._out_scale_dict[layer_name] = scale_out
+    def _calc_output_scale_hook(self, layer, input, output):
+        """
+        Create the MovingAverageAbsMaxScale layer for the target layer if needed.
+        Execute MovingAverageAbsMaxScale layer to calculate the output scale. 
+        """
+        assert isinstance(output, (core.VarBase, framework.Variable)), \
+            "Multiple outputs are not currently supported in ImperativeOutScale."
+
+        fp_types = [core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64]
+        if output.dtype in fp_types:
+            if not hasattr(layer, "_out_scale"):
+                self._out_scale = quant_nn.MovingAverageAbsMaxScale(
+                    layer, output.name, self._moving_rate, output.dtype)
+            # TODO (jc): consider the ops that have several outputs 
+            self._out_scale(output)
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
index 0b052d5dd0..3c4fb323bc 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
@@ -499,6 +499,10 @@ class QuantizedNoweightLayer(layers.Layer):
 
     def forward(self, input):
         quant_input = self._fake_quant_input(input)
+        # TODO (jc): support ops that have several inputs
+        if isinstance(input, list):
+            assert len(input) == 1, \
+                "The QuantizedNoweightLayer should only have one input."
         return self._layer.forward(quant_input)
 
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index a732181db7..1ff4a408e0 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -12,12 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.nn import Linear, Conv2D
-from paddle.fluid.dygraph.nn import Pool2D
-from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6
-from paddle.nn.layer.activation import Tanh, Softmax, PReLU, Swish
+import paddle
 
-_op_real_in_out_name = {
+op_real_in_out_name = {
     "conv2d": [["Input", "Filter"], ["Output"]],
     "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
     "pool2d": [["X"], ["Out"]],
@@ -33,14 +30,30 @@ _op_real_in_out_name = {
     "swish": [["X"], ["Out"]],
 }
 
-_quant_layers_map = {
-    'Conv2D': Conv2D,
-    'Linear': Linear,
-    'Pool2D': Pool2D,
-    'ReLU': ReLU,
-    'LeakyReLU': LeakyReLU,
-    'ReLU6': ReLU6,
-    'Softmax': Softmax,
-    'Tanh': Tanh,
-    'Swish': Swish
+supported_quant_layers_map = {
+    'Conv2D': paddle.nn.Conv2D,
+    'Linear': paddle.nn.Linear,
+    'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D,
+    'AdaptiveMaxPool2D': paddle.nn.AdaptiveMaxPool2D,
+    'AvgPool2D': paddle.nn.AvgPool2D,
+    'MaxPool2D': paddle.nn.MaxPool2D,
+    'Hardswish': paddle.nn.Hardswish,
+    'LeakyReLU': paddle.nn.LeakyReLU,
+    'PReLU': paddle.nn.PReLU,
+    'ReLU': paddle.nn.ReLU,
+    'ReLU6': paddle.nn.ReLU6,
+    'Sigmoid': paddle.nn.Sigmoid,
+    'Softmax': paddle.nn.Softmax,
+    'Swish': paddle.nn.Swish,
+    'Tanh': paddle.nn.Tanh,
+    'Hardswish': paddle.nn.Hardswish,
+    'BatchNorm': paddle.nn.BatchNorm,
+    'GroupNorm': paddle.nn.GroupNorm,
+    'LayerNorm': paddle.nn.LayerNorm,
 }
+
+out_scale_layers_list = (
+    paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.MaxPool2D,
+    paddle.nn.BatchNorm, paddle.nn.BatchNorm2D, paddle.nn.SyncBatchNorm,
+    paddle.nn.LeakyReLU, paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6,
+    paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Tanh, paddle.nn.Swish)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py
index 9d2b2d726e..d76e4825e0 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py
@@ -191,8 +191,8 @@ class TestImperativeAddQuantDequant(unittest.TestCase):
             weight_quantize_type='abs_max',
             activation_quantize_type='moving_average_abs_max',
             quantizable_layer_type=[
-                'Conv2D', 'Linear', 'ReLU', 'Pool2D', 'LeakyReLU', 'ReLU6',
-                'Tanh', 'Swish'
+                'Conv2D', 'Linear', 'ReLU', 'LeakyReLU', 'ReLU6', 'Tanh',
+                'Swish'
             ])
 
         with fluid.dygraph.guard():
-- 
Gitee


From 76ca613d427d8854ace260606ad3bc6c2777d0ef Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:31:09 +0800
Subject: [PATCH 19/45] !27 NMS Performance Optimization Merge pull request !27
 from PaddlePaddle-Gardener/nms_mask_opt

---
 paddle/fluid/operators/detection/bbox_util.cu.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h
index 27852d4394..6d271766b0 100644
--- a/paddle/fluid/operators/detection/bbox_util.cu.h
+++ b/paddle/fluid/operators/detection/bbox_util.cu.h
@@ -275,15 +275,19 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
 
   const T *boxes = proposals.data<T>();
   auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-  framework::Vector<uint64_t> mask(boxes_num * col_blocks);
-  NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes,
-                                 mask.CUDAMutableData(BOOST_GET_CONST(
-                                     platform::CUDAPlace, ctx.GetPlace())),
-                                 pixel_offset);
+  auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t));
+  uint64_t *mask_dev = reinterpret_cast<uint64_t *>(mask_ptr->ptr());
+
+  NMSKernel<<<blocks, threads, 0, ctx.stream()>>>(
+      boxes_num, nms_threshold, boxes, mask_dev, pixel_offset);
 
   std::vector<uint64_t> remv(col_blocks);
   memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
 
+  std::vector<uint64_t> mask_host(boxes_num * col_blocks);
+  memory::Copy(platform::CPUPlace(), mask_host.data(), place, mask_dev,
+               boxes_num * col_blocks * sizeof(uint64_t), ctx.stream());
+
   std::vector<int> keep_vec;
   int num_to_keep = 0;
   for (int i = 0; i < boxes_num; i++) {
@@ -293,7 +297,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
     if (!(remv[nblock] & (1ULL << inblock))) {
       ++num_to_keep;
       keep_vec.push_back(i);
-      uint64_t *p = &mask[0] + i * col_blocks;
+      uint64_t *p = mask_host.data() + i * col_blocks;
       for (int j = nblock; j < col_blocks; j++) {
         remv[j] |= p[j];
       }
-- 
Gitee


From aea57b28b981d6d803af071ab5f46cceaf8ceadd Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:31:12 +0800
Subject: [PATCH 20/45] !29 [oneDNN] lookup_table op with support for BF16 data
 type. Merge pull request !29 from PaddlePaddle-Gardener/aosewski/bf16_lookup

---
 .../ir/mkldnn/cpu_bfloat16_placement_pass.cc  |   4 +-
 .../ir/mkldnn/cpu_bfloat16_placement_pass.h   |   2 +-
 paddle/fluid/operators/lookup_table_op.cc     |   7 +-
 paddle/fluid/operators/lookup_table_op.h      |   6 +-
 paddle/fluid/operators/math/blas_impl.h       |  11 ++
 .../paddle/fluid/tests/unittests/op_test.py   |  16 +-
 .../unittests/test_lookup_table_bf16_op.py    | 176 ++++++++++++++++++
 tools/static_mode_white_list.py               |   1 +
 8 files changed, 213 insertions(+), 10 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py

diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
index 3d7a9c1107..531a04e1a0 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
@@ -53,7 +53,7 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType(
   gpd(graph, handler);
 }
 
-void CPUBfloat16PlacementPass::RemoveOrhanedOperators(
+void CPUBfloat16PlacementPass::RemoveOrphanedOperators(
     ir::Graph* graph, int* bfloat16_operators) const {
   // find orphaned bfloat16 operator that is between two float32 operators
   // revert mkldnn_data_type attr to float32
@@ -74,7 +74,7 @@ void CPUBfloat16PlacementPass::RemoveOrhanedOperators(
 void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const {
   int bfloat16_operators = 0;
   SetMkldnnDataType(graph, &bfloat16_operators);
-  RemoveOrhanedOperators(graph, &bfloat16_operators);
+  RemoveOrphanedOperators(graph, &bfloat16_operators);
   PrettyLogDetail("---    marked %d operators to bfloat16 ",
                   bfloat16_operators);
 }
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h
index 1911b1a3cb..53b97f0e97 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h
@@ -28,7 +28,7 @@ class CPUBfloat16PlacementPass : public Pass {
  protected:
   void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const;
 
-  void RemoveOrhanedOperators(ir::Graph* graph, int* bfloat16_operators) const;
+  void RemoveOrphanedOperators(ir::Graph* graph, int* bfloat16_operators) const;
 
   void ApplyImpl(ir::Graph* graph) const override;
 };
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 1b482235da..2e8b551ea4 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/var_type_inference.h"
+#include "paddle/fluid/platform/bfloat16.h"
 
 namespace paddle {
 namespace operators {
@@ -222,9 +223,11 @@ REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad,
 
 REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>,
                        ops::LookupTableKernel<double>,
-                       ops::LookupTableKernel<int8_t>);
+                       ops::LookupTableKernel<int8_t>,
+                       ops::LookupTableKernel<paddle::platform::bfloat16>);
 REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>,
-                       ops::LookupTableGradKernel<double>);
+                       ops::LookupTableGradKernel<double>,
+                       ops::LookupTableGradKernel<paddle::platform::bfloat16>);
 
 /* ==========================  register checkpoint ===========================*/
 
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 8baa3bccce..e385d72d1f 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -102,7 +102,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
             auto id_index = table_t.GetIndexFromId(ids[i]);
 
             if (id_index != -1) {
-              if (input_data_type == framework::proto::VarType::INT8) {
+              if (input_data_type == framework::proto::VarType::INT8 ||
+                  input_data_type == framework::proto::VarType::BF16) {
                 memcpy(output + i * row_width, table + id_index * row_width,
                        row_width * sizeof(T));
               } else {
@@ -128,7 +129,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
                     "the input key should be exists. But received %d.",
                     id_index));
 
-            if (input_data_type == framework::proto::VarType::INT8) {
+            if (input_data_type == framework::proto::VarType::INT8 ||
+                input_data_type == framework::proto::VarType::BF16) {
               memcpy(output + i * row_width, table + id_index * row_width,
                      row_width * sizeof(T));
             } else {
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 4847c1f05b..64b533de09 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -21,6 +21,7 @@
 #include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 
@@ -40,6 +41,16 @@ struct CBlas<int8_t> {
   }
 };
 
+template <>
+struct CBlas<platform::bfloat16> {
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Blas VCOPY do not supported on CPU with bfloat16,"
+        " please check your code"));
+  }
+};
+
 #ifdef PADDLE_WITH_MKLML
 template <>
 struct CBlas<float> {
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 8ca83d08d6..939e2ac0f5 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -33,10 +33,19 @@ from paddle.fluid.backward import append_backward
 from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
 from paddle.fluid.framework import Program, OpProtoHolder, Variable
-from testsuite import create_op, set_input, append_input_output, append_loss_ops
+from paddle.fluid.tests.unittests.testsuite import (
+    create_op,
+    set_input,
+    append_input_output,
+    append_loss_ops, )
 from paddle.fluid import unique_name
-from white_list import op_accuracy_white_list, check_shape_white_list, compile_vs_runtime_white_list, no_check_set_white_list
-from white_list import op_threshold_white_list, no_grad_set_white_list
+from paddle.fluid.tests.unittests.white_list import (
+    op_accuracy_white_list,
+    check_shape_white_list,
+    compile_vs_runtime_white_list,
+    no_check_set_white_list,
+    op_threshold_white_list,
+    no_grad_set_white_list, )
 
 
 def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs):
@@ -1452,6 +1461,7 @@ class OpTest(unittest.TestCase):
         analytic_grads = self._get_gradient(inputs_to_check, place,
                                             output_names, no_grad_set,
                                             user_defined_grad_outputs)
+
         # comparison of bf16 results will happen as fp32
         # loop over list of grads and convert bf16 to fp32
         fp32_grads = []
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py
new file mode 100644
index 0000000000..13c4aa6d76
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py
@@ -0,0 +1,176 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import (
+    OpTest, convert_float_to_uint16, convert_uint16_to_float,
+    skip_check_grad_ci)
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from paddle import enable_static
+
+
+def _lookup(weights, ids, flat_ids):
+    w_shape = weights.shape
+    out_shape = list(ids.shape[:-1])
+    out_shape.append(w_shape[-1])
+    out = weights[flat_ids].reshape(out_shape)
+    return out
+
+
+def _get_grad(weights, ids, flat_ids):
+    w_shape = weights.shape
+    w_grad = np.zeros((w_shape), dtype=weights.dtype)
+    out_grad_shape = (np.prod(ids.shape[:-1]), w_shape[-1])
+    out_grad = weights[flat_ids].reshape(out_grad_shape)
+    for i, idx in enumerate(flat_ids):
+        w_grad[idx, :] += out_grad[i]
+    return w_grad
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+class TestLookupTableBF16Op(OpTest):
+    def setUp(self):
+        self.op_type = "lookup_table"
+        self.dtype = np.uint16
+
+        table = np.random.random((17, 31)).astype("float32")
+        self.ids = np.random.randint(0, 17, (4, 1)).astype("int64")
+        self.flat_ids = self.ids.flatten()
+
+        self.w_bf16 = convert_float_to_uint16(table)
+        self.out_bf16 = _lookup(self.w_bf16, self.ids, self.flat_ids)
+        self.out_fp32 = _lookup(table, self.ids, self.flat_ids)
+        self.w_grad_fp32 = _get_grad(table, self.ids, self.flat_ids)
+
+        self.inputs = {'W': self.w_bf16, 'Ids': self.ids}
+        self.outputs = {'Out': self.out_fp32}
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CPUPlace(), check_dygraph=False)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ['W'],
+            'Out',
+            no_grad_set=set('Ids'),
+            check_dygraph=False,
+            max_relative_error=1.5e-2,
+            user_defined_grads=[self.w_grad_fp32],
+            user_defined_grad_outputs=[self.out_bf16])
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+class TestLookupTableBF16OpIds4D(TestLookupTableBF16Op):
+    def setUp(self):
+        super(TestLookupTableBF16OpIds4D, self).setUp()
+        self.ids = np.random.randint(0, 17, (2, 4, 5, 1)).astype("int64")
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+class TestLookupTableBF16OpWIsSelectedRows(unittest.TestCase):
+    def setUp(self):
+        self.ids = np.random.randint(
+            low=0, high=15, size=(10, 1)).astype("int64")
+        self.flat_ids = self.ids.flatten()
+        self.w_fp32 = np.random.random((15, 32)).astype("float32")
+        self.w_bf16 = convert_float_to_uint16(self.w_fp32)
+        self.scope = core.Scope()
+        self.place = core.CPUPlace()
+
+    def prepare_w(self):
+        rows = [a for a in range(self.w_bf16.shape[0])]
+        row_numel = self.w_bf16.shape[1]
+
+        w_selected_rows = self.scope.var('W').get_selected_rows()
+        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_rows(rows)
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(self.w_bf16, self.place)
+
+    def prepare_ids(self):
+        ids_tensor = self.scope.var('Ids').get_tensor()
+        ids_tensor.set(self.ids, self.place)
+
+    def _check_output(self, reference, result_array):
+        result_array_fp32 = convert_uint16_to_float(result_array)
+        np.testing.assert_allclose(result_array_fp32, reference, rtol=1.5e-2)
+
+    def test_check_output(self):
+        self.prepare_ids()
+        self.prepare_w()
+        out_tensor = self.scope.var('Out').get_tensor()
+
+        # create and run lookup_table operator
+        lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
+        lookup_table.run(self.scope, self.place)
+
+        # get result from Out
+        result_array = np.array(out_tensor)
+        ref = _lookup(self.w_fp32, self.ids, self.flat_ids)
+        self._check_output(ref, result_array)
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+class TestLookupTableBF16OpWIsSelectedRows4DIds(
+        TestLookupTableBF16OpWIsSelectedRows):
+    def setUp(self):
+        super(TestLookupTableBF16OpWIsSelectedRows4DIds, self).setUp()
+        self.ids = np.random.randint(
+            low=0, high=15, size=(3, 4, 5, 1)).astype("int64")
+        self.flat_ids = self.ids.flatten()
+
+
+@skip_check_grad_ci(
+    reason="Since paddings are not trainable and fixed in forward,"
+    "the gradient of paddings makes no sense and we don't "
+    "test the gradient here.")
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+class TestLookupTableBF16OpWithPadding(TestLookupTableBF16Op):
+    def test_check_output(self):
+        ids = np.squeeze(self.inputs['Ids'])
+        padding_idx = np.random.choice(ids, 1)[0]
+        self.outputs['Out'][ids == padding_idx] = np.zeros(31)
+        self.attrs = {'padding_idx': int(padding_idx)}
+        self.check_output_with_place(core.CPUPlace(), check_dygraph=False)
+
+
+@skip_check_grad_ci(
+    reason="Since paddings are not trainable and fixed in forward,"
+    "the gradient of paddings makes no sense and we don't "
+    "test the gradient here.")
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+class TestLookupTableBF16OpIds4DPadding(TestLookupTableBF16OpIds4D):
+    def test_check_output(self):
+        ids = self.inputs['Ids']
+        flatten_idx = ids.flatten()
+        padding_idx = np.random.choice(flatten_idx, 1)[0]
+        self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
+        self.attrs = {'padding_idx': int(padding_idx)}
+        self.check_output_with_place(core.CPUPlace(), check_dygraph=False)
+
+
+if __name__ == "__main__":
+    enable_static()
+    unittest.main()
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index dc537cb268..2ea3f7654a 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -21,6 +21,7 @@ STATIC_MODE_TESTING_LIST = [
     'test_linear_chain_crf_op',
     'test_lod_reset_op',
     'test_lookup_table_op',
+    'test_lookup_table_bf16_op',
     'test_pad2d_op',
     'test_scatter_op',
     'test_sequence_concat',
-- 
Gitee


From 453b4a5b37fad956f77579265f61e6137e36eef4 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:31:15 +0800
Subject: [PATCH 21/45] !30 [CustomOp] Support attribute in infershape function
 Merge pull request !30 from
 PaddlePaddle-Gardener/extension/support_attr_for_infershape

---
 .../extension/include/ext_op_meta_info.h      | 112 ++++++++++-----
 paddle/fluid/framework/custom_operator.cc     |  50 ++++++-
 .../fluid/tests/custom_op/custom_concat_op.cc |  90 ++++++++++++
 .../tests/custom_op/test_custom_concat.py     | 128 +++++++++++-------
 4 files changed, 289 insertions(+), 91 deletions(-)

diff --git a/paddle/fluid/extension/include/ext_op_meta_info.h b/paddle/fluid/extension/include/ext_op_meta_info.h
index bad1d6ad9f..c400164c75 100644
--- a/paddle/fluid/extension/include/ext_op_meta_info.h
+++ b/paddle/fluid/extension/include/ext_op_meta_info.h
@@ -204,38 +204,68 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
 // Record Op infershape core function
 using InferShapeFunc = std::vector<std::vector<int64_t>> (*)(
     const std::vector<std::vector<int64_t>>& input_shapes,
-    const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes);
+    const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes,
+    const std::vector<boost::any>& attrs);
 
-#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type)             \
-  template <typename... Tail>                                                \
-  struct InferShapeCallHelper<input_type, Tail...> {                         \
-    template <int in_idx, int vec_in_idx, typename... PreviousArgs>          \
-    static Return InferShape(                                                \
-        const std::vector<std::vector<int64_t>>& input_shapes,               \
-        const std::vector<std::vector<std::vector<int64_t>>>&                \
-            vec_input_shapes,                                                \
-        const PreviousArgs&... pargs) {                                      \
-      input_type arg = input_shapes[in_idx];                                 \
-      return InferShapeCallHelper<Tail...>::template InferShape<in_idx + 1,  \
-                                                                vec_in_idx>( \
-          input_shapes, vec_input_shapes, pargs..., arg);                    \
-    }                                                                        \
+#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type)              \
+  template <typename... Tail>                                                 \
+  struct InferShapeCallHelper<input_type, Tail...> {                          \
+    template <int in_idx, int vec_in_idx, int attr_idx,                       \
+              typename... PreviousArgs>                                       \
+    static Return InferShape(                                                 \
+        const std::vector<std::vector<int64_t>>& input_shapes,                \
+        const std::vector<std::vector<std::vector<int64_t>>>&                 \
+            vec_input_shapes,                                                 \
+        const std::vector<boost::any>& attrs, const PreviousArgs&... pargs) { \
+      input_type arg = input_shapes[in_idx];                                  \
+      return InferShapeCallHelper<Tail...>::template InferShape<              \
+          in_idx + 1, vec_in_idx, attr_idx>(input_shapes, vec_input_shapes,   \
+                                            attrs, pargs..., arg);            \
+    }                                                                         \
   }
 
-#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type)           \
-  template <typename... Tail>                                               \
-  struct InferShapeCallHelper<input_type, Tail...> {                        \
-    template <int in_idx, int vec_in_idx, typename... PreviousArgs>         \
-    static Return InferShape(                                               \
-        const std::vector<std::vector<int64_t>>& input_shapes,              \
-        const std::vector<std::vector<std::vector<int64_t>>>&               \
-            vec_input_shapes,                                               \
-        const PreviousArgs&... pargs) {                                     \
-      input_type arg = vec_input_shapes[vec_in_idx];                        \
-      return InferShapeCallHelper<Tail...>::template InferShape<            \
-          in_idx, vec_in_idx + 1>(input_shapes, vec_input_shapes, pargs..., \
-                                  arg);                                     \
-    }                                                                       \
+#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type)             \
+  template <typename... Tail>                                                 \
+  struct InferShapeCallHelper<input_type, Tail...> {                          \
+    template <int in_idx, int vec_in_idx, int attr_idx,                       \
+              typename... PreviousArgs>                                       \
+    static Return InferShape(                                                 \
+        const std::vector<std::vector<int64_t>>& input_shapes,                \
+        const std::vector<std::vector<std::vector<int64_t>>>&                 \
+            vec_input_shapes,                                                 \
+        const std::vector<boost::any>& attrs, const PreviousArgs&... pargs) { \
+      input_type arg = vec_input_shapes[vec_in_idx];                          \
+      return InferShapeCallHelper<Tail...>::template InferShape<              \
+          in_idx, vec_in_idx + 1, attr_idx>(input_shapes, vec_input_shapes,   \
+                                            attrs, pargs..., arg);            \
+    }                                                                         \
+  }
+
+#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type)                \
+  template <typename... Tail>                                                 \
+  struct InferShapeCallHelper<attr_type, Tail...> {                           \
+    template <int in_idx, int vec_in_idx, int attr_idx,                       \
+              typename... PreviousArgs>                                       \
+    static Return InferShape(                                                 \
+        const std::vector<std::vector<int64_t>>& input_shapes,                \
+        const std::vector<std::vector<std::vector<int64_t>>>&                 \
+            vec_input_shapes,                                                 \
+        const std::vector<boost::any>& attrs, const PreviousArgs&... pargs) { \
+      try {                                                                   \
+        attr_type arg = boost::any_cast<attr_type>(attrs[attr_idx]);          \
+        return InferShapeCallHelper<Tail...>::template InferShape<            \
+            in_idx, vec_in_idx, attr_idx + 1>(input_shapes, vec_input_shapes, \
+                                              attrs, pargs..., arg);          \
+      } catch (boost::bad_any_cast&) {                                        \
+        PD_THROW(                                                             \
+            "Attribute cast error in custom operator InferShapeFn. "          \
+            "Expected " #attr_type                                            \
+            " value. InferShapeFn's attribute list must be exactly same as "  \
+            "Forward "                                                        \
+            "KernelFn's attribute list except std::vector<int64_t> "          \
+            "attribute.");                                                    \
+      }                                                                       \
+    }                                                                         \
   }
 
 template <typename F, F f>
@@ -245,10 +275,10 @@ template <typename Return, typename... Args, Return (*impl_fn)(Args...)>
 struct InferShapeFuncImpl<Return (*)(Args...), impl_fn> {
   static Return InferShape(
       const std::vector<std::vector<int64_t>>& input_shapes,
-      const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes) {
-    return InferShapeCallHelper<Args..., TypeTag<int>>::template InferShape<0,
-                                                                            0>(
-        input_shapes, vec_input_shapes);
+      const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes,
+      const std::vector<boost::any>& attrs) {
+    return InferShapeCallHelper<Args..., TypeTag<int>>::template InferShape<
+        0, 0, 0>(input_shapes, vec_input_shapes, attrs);
   }
 
  private:
@@ -265,14 +295,26 @@ struct InferShapeFuncImpl<Return (*)(Args...), impl_fn> {
   PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(
       std::vector<std::vector<int64_t>>);
 
+  PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const bool&);
+  PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const int&);
+  PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const float&);
+  PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const int64_t&);
+  PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::string&);
+  PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector<int>&);
+  PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector<float>&);
+  PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector<std::string>&);
+  // NOTE(chenweihang): InferShape can't support std::vector<int64_t> attr type,
+  // because the input type is std::vector<int64_t>, only can use one rule to
+  // parse std::vector<int64_t> parameter
+
   // end: base template
   template <typename T>
   struct InferShapeCallHelper<TypeTag<T>> {
-    template <int in_idx, int vec_in_idx>
+    template <int in_idx, int vec_in_idx, int attr_idx>
     static Return InferShape(
         const std::vector<std::vector<int64_t>>& input_shapes,
         const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes,
-        const Args&... args) {
+        const std::vector<boost::any>& attrs, const Args&... args) {
       return impl_fn(args...);
     }
   };
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 69a9be603e..1ebb8998c8 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -178,7 +178,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
           "Unsupported `%s` type value as custom attribute now. "
           "Supported data types include `bool`, `int`, `float`, "
           "`int64_t`, `std::string`, `std::vector<int>`, "
-          "`std::vector<float>`, `std::vector<int64_t>, "
+          "`std::vector<float>`, `std::vector<int64_t>`, "
           "`std::vector<std::string>`, Please check whether "
           "the attribute data type and data type string are matched.",
           attr_type_str));
@@ -327,7 +327,7 @@ class CustomOpMaker : public OpProtoAndCheckerMaker {
             "Unsupported `%s` type value as custom attribute now. "
             "Supported data types include `bool`, `int`, `float`, "
             "`int64_t`, `std::string`, `std::vector<int>`, "
-            "`std::vector<float>`, `std::vector<int64_t>, "
+            "`std::vector<float>`, `std::vector<int64_t>`, "
             "`std::vector<std::string>`, Please check whether "
             "the attribute data type and data type string are matched.",
             attr_type_str));
@@ -581,7 +581,7 @@ void RegisterOperatorWithMetaInfo(
       ctx->ShareDim(op_inputs[0], op_outputs[0]);
     };
   } else {
-    info.infer_shape_ = [op_inputs, op_outputs,
+    info.infer_shape_ = [op_inputs, op_outputs, op_attrs,
                          infer_shape_func](InferShapeContext* ctx) {
       std::vector<std::vector<int64_t>> input_shapes;
       std::vector<std::vector<std::vector<int64_t>>> vec_input_shapes;
@@ -606,8 +606,50 @@ void RegisterOperatorWithMetaInfo(
         }
       }
 
+      std::vector<boost::any> custom_attrs;
+      for (auto& attr_str : op_attrs) {
+        auto attr_name_and_type = detail::ParseAttrStr(attr_str);
+        auto attr_name = attr_name_and_type[0];
+        auto attr_type_str = attr_name_and_type[1];
+        if (attr_type_str == "bool") {
+          custom_attrs.emplace_back(ctx->Attrs().Get<bool>(attr_name));
+        } else if (attr_type_str == "int") {
+          custom_attrs.emplace_back(ctx->Attrs().Get<int>(attr_name));
+        } else if (attr_type_str == "float") {
+          custom_attrs.emplace_back(ctx->Attrs().Get<float>(attr_name));
+        } else if (attr_type_str == "int64_t") {
+          custom_attrs.emplace_back(ctx->Attrs().Get<int64_t>(attr_name));
+        } else if (attr_type_str == "std::string") {
+          custom_attrs.emplace_back(ctx->Attrs().Get<std::string>(attr_name));
+        } else if (attr_type_str == "std::vector<int>") {
+          custom_attrs.emplace_back(
+              ctx->Attrs().Get<std::vector<int>>(attr_name));
+        } else if (attr_type_str == "std::vector<float>") {
+          custom_attrs.emplace_back(
+              ctx->Attrs().Get<std::vector<float>>(attr_name));
+        } else if (attr_type_str == "std::vector<int64_t>") {
+          // NOTE(chenweihang): InferShape can't support std::vector<int64_t>
+          // attr type, because the input type is std::vector<int64_t>, only
+          // can use one rule to parse std::vector<int64_t> parameter
+          continue;
+        } else if (attr_type_str == "std::vector<std::string>") {
+          custom_attrs.emplace_back(
+              ctx->Attrs().Get<std::vector<std::string>>(attr_name));
+        } else {
+          PADDLE_THROW(platform::errors::Unimplemented(
+              "Unsupported `%s` type value as custom attribute now. "
+              "Supported data types include `bool`, `int`, `float`, "
+              "`int64_t`, `std::string`, `std::vector<int>`, "
+              "`std::vector<float>`, `std::vector<std::string>`, "
+              "Please check whether the attribute data type and "
+              "data type string are matched.",
+              attr_type_str));
+        }
+      }
+
       VLOG(1) << "Custom Operator: InferShape - calc output ddim.";
-      auto output_shapes = infer_shape_func(input_shapes, vec_input_shapes);
+      auto output_shapes =
+          infer_shape_func(input_shapes, vec_input_shapes, custom_attrs);
 
       VLOG(1) << "Custom Operator: InferShape - set output ddim.";
       for (size_t i = 0; i < op_outputs.size(); ++i) {
diff --git a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc
index 2d8d0ccb88..a01e01f2bc 100644
--- a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc
@@ -144,3 +144,93 @@ PD_BUILD_GRAD_OP(custom_concat)
     .Inputs({paddle::Vec("X"), paddle::Grad("Out"), "Axis"})
     .Outputs({paddle::Grad(paddle::Vec("X"))})
     .SetKernelFn(PD_KERNEL(ConcatBackwardDynamicAxis));
+
+std::vector<paddle::Tensor> ConcatForwardStaticAxis(
+    const std::vector<paddle::Tensor>& inputs, const int64_t& axis) {
+  // check inputs
+  PD_CHECK(inputs.size() >= 1, "No Tensor need to be concat.");
+  for (auto& t : inputs) {
+    CHECK_INPUT(t);
+  }
+
+  // compute output shape
+  int64_t rank = static_cast<int64_t>(inputs[0].shape().size());
+  auto final_axis = ComputeAxis(axis, rank);
+  std::vector<std::vector<int64_t>> in_shapes;
+  for (auto& t : inputs) {
+    in_shapes.emplace_back(t.shape());
+  }
+  auto out_shape = ComputeOutShape(in_shapes, final_axis);
+
+  // create output
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(out_shape);
+
+  // calc
+  PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
+      inputs[0].type(), "ConcatCpuKernel", ([&] {
+        ConcatCpuKernel<data_t>(inputs, &out, final_axis);
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> ConcatBackwardStaticAxis(
+    const std::vector<paddle::Tensor>& inputs,
+    const paddle::Tensor& grad_out,
+    const int64_t& axis) {
+  // check input
+  PD_CHECK(inputs.size() >= 1, "No Tensor need to be concat.");
+  for (auto& t : inputs) {
+    CHECK_INPUT(t);
+  }
+  CHECK_INPUT(grad_out);
+
+  // compate axis
+  int64_t rank = static_cast<int64_t>(inputs[0].shape().size());
+  auto final_axis = ComputeAxis(axis, rank);
+
+  // create outputs
+  std::vector<paddle::Tensor> grad_inputs;
+  for (auto& t : inputs) {
+    auto grad = paddle::Tensor(paddle::PlaceType::kCPU);
+    grad.reshape(t.shape());
+    grad_inputs.emplace_back(grad);
+  }
+
+  // calc
+  PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
+      grad_out.type(), "SplitCpuKernel", ([&] {
+        SplitCpuKernel<data_t>(grad_out, inputs, &grad_inputs, final_axis);
+      }));
+
+  return grad_inputs;
+}
+
+std::vector<std::vector<int64_t>> ConcatInferShapeStaticAxis(
+    const std::vector<std::vector<int64_t>>& input_shapes,
+    const int64_t& axis) {
+  int64_t rank = static_cast<int64_t>(input_shapes[0].size());
+  auto final_axis = ComputeAxis(axis, rank);
+  auto out_shape = ComputeOutShape(input_shapes, final_axis);
+  return {out_shape};
+}
+
+std::vector<paddle::DataType> ConcatInferDtypeStaticAxis(
+    const std::vector<paddle::DataType>& input_dtypes) {
+  return {input_dtypes[0]};
+}
+
+PD_BUILD_OP(custom_concat_with_attr)
+    .Inputs({paddle::Vec("X")})
+    .Outputs({"Out"})
+    .Attrs({"axis: int64_t"})
+    .SetKernelFn(PD_KERNEL(ConcatForwardStaticAxis))
+    .SetInferShapeFn(PD_INFER_SHAPE(ConcatInferShapeStaticAxis))
+    .SetInferDtypeFn(PD_INFER_DTYPE(ConcatInferDtypeStaticAxis));
+
+PD_BUILD_GRAD_OP(custom_concat_with_attr)
+    .Inputs({paddle::Vec("X"), paddle::Grad("Out")})
+    .Outputs({paddle::Grad(paddle::Vec("X"))})
+    .Attrs({"axis: int64_t"})
+    .SetKernelFn(PD_KERNEL(ConcatBackwardStaticAxis));
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py
index 4086224cd7..ea41126c1c 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_concat.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py
@@ -45,14 +45,16 @@ custom_ops = load(
     verbose=True)
 
 
-def concat_dynamic(func, device, dtype, np_inputs, axis_v):
-    paddle.set_device(device)
+def concat_dynamic(func, dtype, np_inputs, axis_v, with_attr=False):
+    paddle.set_device("cpu")
     inputs = [
         paddle.to_tensor(
-            x, dtype=dtype, place=device, stop_gradient=False)
-        for x in np_inputs
+            x, dtype=dtype, stop_gradient=False) for x in np_inputs
     ]
-    axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v)
+    if with_attr:
+        axis = axis_v
+    else:
+        axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v)
     out = func(inputs, axis)
     out.stop_gradient = False
     out.backward()
@@ -60,14 +62,17 @@ def concat_dynamic(func, device, dtype, np_inputs, axis_v):
     return out.numpy(), grad_inputs
 
 
-def concat_static(func, device, dtype, np_inputs, axis_v):
+def concat_static(func, dtype, np_inputs, axis_v, with_attr=False):
     paddle.enable_static()
-    paddle.set_device(device)
+    paddle.set_device("cpu")
     with static.scope_guard(static.Scope()):
         with static.program_guard(static.Program()):
             x1 = static.data(name="x1", shape=[2, 3], dtype=dtype)
             x2 = static.data(name="x2", shape=[2, 3], dtype=dtype)
-            axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v)
+            if with_attr:
+                axis = axis_v
+            else:
+                axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v)
             x1.stop_gradient = False
             x2.stop_gradient = False
             out = func([x1, x2], axis)
@@ -78,13 +83,20 @@ def concat_static(func, device, dtype, np_inputs, axis_v):
             exe = static.Executor()
             exe.run(static.default_startup_program())
 
-            out_v, x1_grad_v, x2_grad_v = exe.run(
-                static.default_main_program(),
-                feed={
+            if with_attr:
+                feed_dict = {
+                    "x1": np_inputs[0].astype(dtype),
+                    "x2": np_inputs[1].astype(dtype)
+                }
+            else:
+                feed_dict = {
                     "x1": np_inputs[0].astype(dtype),
                     "x2": np_inputs[1].astype(dtype),
                     "axis": axis
-                },
+                }
+            out_v, x1_grad_v, x2_grad_v = exe.run(
+                static.default_main_program(),
+                feed=feed_dict,
                 fetch_list=[out.name, x1.name + "@GRAD", x2.name + "@GRAD"])
     paddle.disable_static()
     return out_v, x1_grad_v, x2_grad_v
@@ -93,55 +105,67 @@ def concat_static(func, device, dtype, np_inputs, axis_v):
 class TestCustomConcatDynamicAxisJit(unittest.TestCase):
     def setUp(self):
         self.dtypes = ['float32', 'float64', 'int32', 'int64']
-        self.devices = ['cpu']
         self.np_inputs = [
             np.array([[1, 2, 3], [4, 5, 6]]),
             np.array([[11, 12, 13], [14, 15, 16]])
         ]
         self.axises = [0, 1]
 
+    def check_output(self, out, pd_out, name):
+        self.assertTrue(
+            np.array_equal(out, pd_out),
+            "custom op {}: {},\n paddle api {}: {}".format(name, out, name,
+                                                           pd_out))
+
     def test_dynamic(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                for axis in self.axises:
-                    out, grad_inputs = concat_dynamic(custom_ops.custom_concat,
-                                                      device, dtype,
-                                                      self.np_inputs, axis)
-                    pd_out, pd_grad_inputs = concat_dynamic(
-                        paddle.concat, device, dtype, self.np_inputs, axis)
-
-                    self.assertTrue(
-                        np.array_equal(out, pd_out),
-                        "custom op out: {},\n paddle api out: {}".format(
-                            out, pd_out))
-                    for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs):
-                        self.assertTrue(
-                            np.array_equal(x_grad, pd_x_grad),
-                            "custom op x grad: {},\n paddle api x grad: {}".
-                            format(x_grad, pd_x_grad))
+        for dtype in self.dtypes:
+            for axis in self.axises:
+                out, grad_inputs = concat_dynamic(custom_ops.custom_concat,
+                                                  dtype, self.np_inputs, axis)
+                pd_out, pd_grad_inputs = concat_dynamic(paddle.concat, dtype,
+                                                        self.np_inputs, axis)
+
+                self.check_output(out, pd_out, "out")
+                for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs):
+                    self.check_output(x_grad, pd_x_grad, "x_grad")
 
     def test_static(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                for axis in self.axises:
-                    out, x1_grad, x2_grad = concat_static(
-                        custom_ops.custom_concat, device, dtype, self.np_inputs,
-                        axis)
-                    pd_out, pd_x1_grad, pd_x2_grad = concat_static(
-                        paddle.concat, device, dtype, self.np_inputs, axis)
-
-                    self.assertTrue(
-                        np.array_equal(out, pd_out),
-                        "custom op out: {},\n paddle api out: {}".format(
-                            out, pd_out))
-                    self.assertTrue(
-                        np.array_equal(x1_grad, pd_x1_grad),
-                        "custom op x1_grad: {},\n paddle api x1_grad: {}".
-                        format(x1_grad, pd_x1_grad))
-                    self.assertTrue(
-                        np.array_equal(x2_grad, pd_x2_grad),
-                        "custom op x2_grad: {},\n paddle api x2_grad: {}".
-                        format(x2_grad, pd_x2_grad))
+        for dtype in self.dtypes:
+            for axis in self.axises:
+                out, x1_grad, x2_grad = concat_static(
+                    custom_ops.custom_concat, dtype, self.np_inputs, axis)
+                pd_out, pd_x1_grad, pd_x2_grad = concat_static(
+                    paddle.concat, dtype, self.np_inputs, axis)
+
+                self.check_output(out, pd_out, "out")
+                self.check_output(x1_grad, pd_x1_grad, "x1_grad")
+                self.check_output(x2_grad, pd_x2_grad, "x2_grad")
+
+    def test_dynamic_with_attr(self):
+        for dtype in self.dtypes:
+            for axis in self.axises:
+                out, grad_inputs = concat_dynamic(
+                    custom_ops.custom_concat_with_attr, dtype, self.np_inputs,
+                    axis, True)
+                pd_out, pd_grad_inputs = concat_dynamic(
+                    paddle.concat, dtype, self.np_inputs, axis, True)
+
+                self.check_output(out, pd_out, "out")
+                for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs):
+                    self.check_output(x_grad, pd_x_grad, "x_grad")
+
+    def test_static_with_attr(self):
+        for dtype in self.dtypes:
+            for axis in self.axises:
+                out, x1_grad, x2_grad = concat_static(
+                    custom_ops.custom_concat_with_attr, dtype, self.np_inputs,
+                    axis, True)
+                pd_out, pd_x1_grad, pd_x2_grad = concat_static(
+                    paddle.concat, dtype, self.np_inputs, axis, True)
+
+                self.check_output(out, pd_out, "out")
+                self.check_output(x1_grad, pd_x1_grad, "x1_grad")
+                self.check_output(x2_grad, pd_x2_grad, "x2_grad")
 
 
 if __name__ == "__main__":
-- 
Gitee


From 61236925971782121683ec69b247fa0db0b0fa67 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:31:18 +0800
Subject: [PATCH 22/45] !31 Run radix sort of proposals layers on the context
 stream Merge pull request !31 from PaddlePaddle-Gardener/radix_sort_stream

---
 .../fluid/operators/detection/bbox_util.cu.h  |  5 ++--
 .../detection/collect_fpn_proposals_op.cu     | 15 ++++++-----
 .../detection/distribute_fpn_proposals_op.cu  | 26 +++++++++----------
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h
index 6d271766b0..725983f815 100644
--- a/paddle/fluid/operators/detection/bbox_util.cu.h
+++ b/paddle/fluid/operators/detection/bbox_util.cu.h
@@ -66,7 +66,8 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Determine temporary device storage requirements
   size_t temp_storage_bytes = 0;
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
-      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
+      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num, 0,
+      sizeof(T) * 8, ctx.stream());
   // Allocate temporary storage
   auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
@@ -74,7 +75,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Run sorting operation
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
-      idx_out, num);
+      idx_out, num, 0, sizeof(T) * 8, ctx.stream());
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index bc74c80e03..ffd9ac6b2a 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -144,7 +144,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     size_t temp_storage_bytes = 0;
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
-        idx_out, total_roi_num);
+        idx_out, total_roi_num, 0, sizeof(T) * 8, dev_ctx.stream());
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
@@ -152,7 +152,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     // sort score to get corresponding index
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
-        keys_out, idx_in, idx_out, total_roi_num);
+        keys_out, idx_in, idx_out, total_roi_num, 0, sizeof(T) * 8,
+        dev_ctx.stream());
     index_out_t.Resize({real_post_num});
     Tensor sorted_rois;
     sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
@@ -176,7 +177,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     temp_storage_bytes = 0;
     cub::DeviceRadixSort::SortPairs<int, int>(
         nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
-        batch_idx_in, index_out_t.data<int>(), real_post_num);
+        batch_idx_in, index_out_t.data<int>(), real_post_num, 0,
+        sizeof(int) * 8, dev_ctx.stream());
     // Allocate temporary storage
     d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
@@ -184,7 +186,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     // sort batch_id to get corresponding index
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
-        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
+        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num, 0,
+        sizeof(int) * 8, dev_ctx.stream());
 
     GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
 
@@ -198,8 +201,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     int threads = kNumCUDAThreads;
 
     // get length-based lod by batch ids
-    GetLengthLoD<<<blocks, threads>>>(real_post_num, out_id_data,
-                                      length_lod_data);
+    GetLengthLoD<<<blocks, threads, 0, dev_ctx.stream()>>>(
+        real_post_num, out_id_data, length_lod_data);
     std::vector<int> length_lod_cpu(lod_size);
     memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place,
                  length_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index cc61035309..7ccb354e17 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     int dist_blocks = NumBlocks(roi_num);
     int threads = kNumCUDAThreads;
     // get target levels and sub_lod list
-    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads>>>(
+    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads, 0, dev_ctx.stream()>>>(
         roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
         max_level, min_level, roi_batch_id_list_gpu.data<int>(),
         sub_lod_list_data, target_lvls_data, pixel_offset);
-    dev_ctx.Wait();
     auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
 
     Tensor index_in_t;
@@ -150,9 +149,9 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 
     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
-                                              target_lvls_data, keys_out,
-                                              idx_in, idx_out, roi_num);
+    cub::DeviceRadixSort::SortPairs<int, int>(
+        nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in,
+        idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream());
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
@@ -160,29 +159,30 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     // sort target level to get corresponding index
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
-        idx_in, idx_out, roi_num);
+        idx_in, idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream());
 
     int* restore_idx_data =
         restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
     // sort current index to get restore index
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
-        restore_idx_data, roi_num);
+        restore_idx_data, roi_num, 0, sizeof(int) * 8, dev_ctx.stream());
 
     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
 
+    std::vector<int> sub_lod_list_cpu(lod_size * num_level);
+    memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place,
+                 sub_lod_list_data, sizeof(int) * lod_size * num_level,
+                 dev_ctx.stream());
+    dev_ctx.Wait();
+
     for (int i = 0; i < num_level; ++i) {
       Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
-      int* sub_lod_data = sub_lod.data<int>();
       // transfer length-based lod to offset-based lod
       std::vector<size_t> offset(1, 0);
-      std::vector<int> sub_lod_cpu(lod_size);
-      memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place,
-                   sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
-      dev_ctx.Wait();
       for (int j = 0; j < lod_size; ++j) {
-        offset.emplace_back(offset.back() + sub_lod_cpu[j]);
+        offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]);
       }
 
       int sub_rois_num = offset.back();
-- 
Gitee


From 542207fb94e99dbe3e3b1b7ca06d958486a2e964 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:31:22 +0800
Subject: [PATCH 23/45] !32 [ROCM] fix test_rnn_op Merge pull request !32 from
 PaddlePaddle-Gardener/bugfix_rocm_rnn

---
 paddle/fluid/operators/rnn_op.cu.cc           |  7 ++---
 paddle/fluid/platform/dynload/miopen.h        |  1 +
 .../fluid/tests/unittests/test_rnn_op.py      | 27 ++++++++++++++++---
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc
index ccf619a074..2be59c6204 100644
--- a/paddle/fluid/operators/rnn_op.cu.cc
+++ b/paddle/fluid/operators/rnn_op.cu.cc
@@ -117,10 +117,11 @@ class RNNDescriptors {
 
 // ------------------- cudnn rnn descriptors ---------------------
 #ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor(
-        rnn_desc_.desc(), hidden_size_, num_layers_, miopenRNNlinear,
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2(
+        rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(),
+        miopenRNNlinear,
         is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, mode_,
-        miopenRNNNoBias, miopenRNNdefault, cudnn_type));
+        miopenRNNwithBias, miopenRNNdefault, cudnn_type));
 #elif CUDNN_VERSION >= 6000
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
         handle, rnn_desc_.desc(), hidden_size_, num_layers_,
diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h
index 43a3e1a107..15de4c64e3 100644
--- a/paddle/fluid/platform/dynload/miopen.h
+++ b/paddle/fluid/platform/dynload/miopen.h
@@ -125,6 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(miopenCreateRNNDescriptor);                     \
   __macro(miopenDestroyRNNDescriptor);                    \
   __macro(miopenSetRNNDescriptor);                        \
+  __macro(miopenSetRNNDescriptor_V2);                     \
   __macro(miopenGetRNNParamsSize);                        \
   __macro(miopenGetRNNWorkspaceSize);                     \
   __macro(miopenGetRNNTrainingReserveSize);               \
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_op.py b/python/paddle/fluid/tests/unittests/test_rnn_op.py
index 5ad2ffec98..22e07b0bc4 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_op.py
@@ -47,8 +47,10 @@ class TestRNNOp(OpTest):
 
     def setUp(self):
         self.op_type = "rnn"
-        self.dtype = np.float64
-        self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32)
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
+        self.sequence_length = None if core.is_compiled_with_rocm(
+        ) else np.array(
+            [12, 11, 10, 9, 8], dtype=np.int32)
         self.num_layers = 1
         self.is_bidirec = False
         self.mode = "LSTM"
@@ -78,12 +80,31 @@ class TestRNNOp(OpTest):
             num_layers=self.num_layers,
             time_major=True,
             direction=direction,
-            dropout=self.dropout)
+            dropout=self.dropout,
+            dtype=self.dtype)
 
         flat_w = get_params_for_net(rnn1)
         output, (last_hidden, last_cell) = rnn1(
             input, sequence_length=self.sequence_length)
 
+        if core.is_compiled_with_rocm():
+
+            def rocm_rnn_get_place():
+                places = [core.CUDAPlace(0)]
+                return places
+
+            self._get_places = rocm_rnn_get_place
+
+            if self.is_bidirec:
+                for i in range(0, len(flat_w), 4):
+                    flat_w[i + 1], flat_w[i + 2] = flat_w[i + 2], flat_w[i + 1]
+
+            for i in range(len(flat_w)):
+                w = np.split(flat_w[i][1], 4, 0)
+                w = [w[0], w[1], w[3], w[2]]
+                w = np.concatenate(w)
+                flat_w[i] = (flat_w[i][0], w)
+
         init_h = np.zeros((self.num_layers * self.direction_num, batch_size,
                            hidden_size)).astype(self.dtype)
         init_c = np.zeros((self.num_layers * self.direction_num, batch_size,
-- 
Gitee


From 5e0f0ffd745c9e6c1598b38e08801f3397ec5476 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:31:25 +0800
Subject: [PATCH 24/45] !33 [CustomOp] Support float16 in custom op Merge pull
 request !33 from PaddlePaddle-Gardener/extension/support_float16_in_custom_op

---
 cmake/inference_lib.cmake                     |  3 +++
 paddle/fluid/extension/include/ext_dispatch.h | 16 +++++++++++++++
 paddle/fluid/extension/include/ext_dtype.h    |  6 ++++++
 paddle/fluid/extension/src/ext_tensor.cc      | 16 +++++++++++++++
 paddle/fluid/framework/custom_tensor_test.cc  | 11 ++++++++++
 paddle/fluid/framework/custom_tensor_utils.h  |  4 ++++
 .../fluid/tests/custom_op/CMakeLists.txt      | 13 ++----------
 .../fluid/tests/custom_op/custom_relu_op.cu   |  9 +++++----
 .../fluid/tests/custom_op/dispatch_test_op.cc | 18 +++++++++++++++++
 .../custom_op/test_custom_relu_op_jit.py      | 20 +++++++++++++------
 .../custom_op/test_custom_relu_op_setup.py    | 14 +++++++++++--
 .../tests/custom_op/test_dispatch_jit.py      |  6 ++++++
 python/setup.py.in                            |  9 +++------
 13 files changed, 116 insertions(+), 29 deletions(-)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 570b37ff11..4864e04fa0 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -198,6 +198,9 @@ copy(inference_lib_dist
 copy(inference_lib_dist
         SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h
         DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
+copy(inference_lib_dist
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
 
 # CAPI inference library for only inference
 set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
diff --git a/paddle/fluid/extension/include/ext_dispatch.h b/paddle/fluid/extension/include/ext_dispatch.h
index 7b3893e283..9b3e199708 100644
--- a/paddle/fluid/extension/include/ext_dispatch.h
+++ b/paddle/fluid/extension/include/ext_dispatch.h
@@ -47,6 +47,22 @@ namespace paddle {
     }                                                                     \
   }()
 
+#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...)                   \
+  [&] {                                                                        \
+    const auto& __dtype__ = TYPE;                                              \
+    switch (__dtype__) {                                                       \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,           \
+                           __VA_ARGS__)                                        \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,          \
+                           __VA_ARGS__)                                        \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT16, paddle::float16, \
+                           __VA_ARGS__)                                        \
+      default:                                                                 \
+        PD_THROW("function " #NAME " is not implemented for data type `",      \
+                 ::paddle::ToString(__dtype__), "`");                          \
+    }                                                                          \
+  }()
+
 ///////// Integral Dispatch Marco ///////////
 
 #define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...)                           \
diff --git a/paddle/fluid/extension/include/ext_dtype.h b/paddle/fluid/extension/include/ext_dtype.h
index a1e58fbacd..3890631a6f 100644
--- a/paddle/fluid/extension/include/ext_dtype.h
+++ b/paddle/fluid/extension/include/ext_dtype.h
@@ -19,11 +19,13 @@ limitations under the License. */
 #include "complex128.h"     // NOLINT
 #include "complex64.h"      // NOLINT
 #include "ext_exception.h"  // NOLINT
+#include "float16.h"        // NOLINT
 
 namespace paddle {
 
 using complex64 = paddle::platform::complex64;
 using complex128 = paddle::platform::complex128;
+using float16 = paddle::platform::float16;
 
 enum class DataType {
   BOOL,
@@ -32,6 +34,7 @@ enum class DataType {
   INT16,
   INT32,
   INT64,
+  FLOAT16,
   FLOAT32,
   FLOAT64,
   COMPLEX64,
@@ -53,6 +56,8 @@ inline std::string ToString(DataType dtype) {
       return "int32_t";
     case DataType::INT64:
       return "int64_t";
+    case DataType::FLOAT16:
+      return "float16";
     case DataType::FLOAT32:
       return "float";
     case DataType::FLOAT64:
@@ -73,6 +78,7 @@ inline std::string ToString(DataType dtype) {
   _(int16_t, DataType::INT16)       \
   _(int, DataType::INT32)           \
   _(int64_t, DataType::INT64)       \
+  _(float16, DataType::FLOAT16)     \
   _(float, DataType::FLOAT32)       \
   _(double, DataType::FLOAT64)      \
   _(complex64, DataType::COMPLEX64) \
diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc
index cb37bf180c..0cae8f4af7 100644
--- a/paddle/fluid/extension/src/ext_tensor.cc
+++ b/paddle/fluid/extension/src/ext_tensor.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
@@ -170,6 +171,8 @@ DataType Tensor::type() const {
     return DataType::COMPLEX64;
   } else if (type == framework::proto::VarType::COMPLEX128) {
     return DataType::COMPLEX128;
+  } else if (type == framework::proto::VarType::FP16) {
+    return DataType::FLOAT16;
   }
   // TODO(JiabinYang) Support more dtype here
   return DataType::FLOAT32;
@@ -229,6 +232,8 @@ template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>(
     const PlaceType &target_place) const;
 template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>(
     const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
 
 template PD_DLL_DECL float *Tensor::data<float>() const;
 template PD_DLL_DECL double *Tensor::data<double>() const;
@@ -242,6 +247,8 @@ template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::data<paddle::platform::complex64>() const;
 template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::data<paddle::platform::complex128>() const;
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::data<paddle::platform::float16>() const;
 
 template PD_DLL_DECL float *Tensor::mutable_data<float>();
 template PD_DLL_DECL double *Tensor::mutable_data<double>();
@@ -255,6 +262,8 @@ template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>();
 template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>();
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>();
 
 template PD_DLL_DECL float *Tensor::mutable_data<float>(const PlaceType &place);
 template PD_DLL_DECL double *Tensor::mutable_data<double>(
@@ -274,6 +283,8 @@ template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
 template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
 
 std::vector<int64_t> Tensor::shape() const {
   GET_CASTED_TENSOR
@@ -344,6 +355,11 @@ Tensor Tensor::cast(const DataType &target_type) const {
                                CastDataType<paddle::platform::complex128>(
                                    *tensor, rlt_tensor_, ctx));
       break;
+    case framework::proto::VarType::FP16:
+      framework::VisitDataType(
+          dst_type,
+          CastDataType<paddle::platform::float16>(*tensor, rlt_tensor_, ctx));
+      break;
     // TODO(JiabinYang) Support more dtype here
     default:
       PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 7da5658860..8d6fd4efd5 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -113,6 +113,8 @@ void GroupTestCopy() {
   TestCopyTensor<paddle::complex64>();
   VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor<paddle::complex128>();
+  VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<paddle::float16>();
 }
 
 void GroupTestCast() {
@@ -134,6 +136,8 @@ void GroupTestCast() {
   TestCast<paddle::complex64>(paddle::DataType::FLOAT32);
   VLOG(2) << "complex128 cast";
   TestCast<paddle::complex128>(paddle::DataType::FLOAT32);
+  VLOG(2) << "float16 cast";
+  TestCast<paddle::float16>(paddle::DataType::FLOAT16);
 }
 
 void GroupTestDtype() {
@@ -146,6 +150,7 @@ void GroupTestDtype() {
   CHECK(TestDtype<uint8_t>() == paddle::DataType::UINT8);
   CHECK(TestDtype<paddle::complex64>() == paddle::DataType::COMPLEX64);
   CHECK(TestDtype<paddle::complex128>() == paddle::DataType::COMPLEX128);
+  CHECK(TestDtype<paddle::float16>() == paddle::DataType::FLOAT16);
 }
 
 void GroupTestDtypeConvert() {
@@ -178,6 +183,9 @@ void GroupTestDtypeConvert() {
   CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
             paddle::DataType::COMPLEX128) ==
         paddle::framework::proto::VarType::COMPLEX128);
+  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
+            paddle::DataType::FLOAT16) ==
+        paddle::framework::proto::VarType::FP16);
   // proto -> enum
   CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
             paddle::framework::proto::VarType::FP64) ==
@@ -207,6 +215,9 @@ void GroupTestDtypeConvert() {
   CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
             paddle::framework::proto::VarType::COMPLEX128) ==
         paddle::DataType::COMPLEX128);
+  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
+            paddle::framework::proto::VarType::FP16) ==
+        paddle::DataType::FLOAT16);
 }
 
 TEST(CustomTensor, copyTest) {
diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h
index a252d6aef4..fad1e3ee34 100644
--- a/paddle/fluid/framework/custom_tensor_utils.h
+++ b/paddle/fluid/framework/custom_tensor_utils.h
@@ -60,6 +60,8 @@ class CustomTensorUtils {
         return framework::proto::VarType::COMPLEX64;
       case paddle::DataType::COMPLEX128:
         return framework::proto::VarType::COMPLEX128;
+      case paddle::DataType::FLOAT16:
+        return framework::proto::VarType::FP16;
       case paddle::DataType::BOOL:
         return framework::proto::VarType::BOOL;
       default:
@@ -91,6 +93,8 @@ class CustomTensorUtils {
         return paddle::DataType::COMPLEX64;
       case framework::proto::VarType::COMPLEX128:
         return paddle::DataType::COMPLEX128;
+      case framework::proto::VarType::FP16:
+        return paddle::DataType::FLOAT16;
       case framework::proto::VarType::BOOL:
         return paddle::DataType::BOOL;
       default:
diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
index 4ba537930c..36496ec499 100644
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -13,24 +13,15 @@ endif()
 
 py_test(test_sysconfig SRCS test_sysconfig.py)
 
-# 'test_dispatch' compile .cc file
+# CPU custom op tests: only compile .cc file
 py_test(test_dispatch_jit SRCS test_dispatch_jit.py)
-set_tests_properties(test_dispatch_jit PROPERTIES TIMEOUT 120)
-
 py_test(test_multi_out_jit SRCS test_multi_out_jit.py)
-set_tests_properties(test_multi_out_jit PROPERTIES TIMEOUT 120)
-
 py_test(test_custom_attrs_jit SRCS test_custom_attrs_jit.py)
-set_tests_properties(test_custom_attrs_jit PROPERTIES TIMEOUT 120)
-
 py_test(test_custom_concat SRCS test_custom_concat.py)
-set_tests_properties(test_custom_concat PROPERTIES TIMEOUT 120)
-
 py_test(test_custom_conj SRCS test_custom_conj.py)
-set_tests_properties(test_custom_conj PROPERTIES TIMEOUT 120)
 
+# other tests
 py_test(test_check_abi SRCS test_check_abi.py)
-
 cc_test(test_check_error SRCS test_check_error.cc DEPS gtest)
 
 if(NOT LINUX)
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
index be3309d84f..4ec7d08845 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
@@ -20,7 +20,7 @@ __global__ void relu_cuda_forward_kernel(const data_t* x,
                                          const int num) {
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
-    y[i] = max(x[i], static_cast<data_t>(0.));
+    y[i] = x[i] > static_cast<data_t>(0.) ? x[i] : static_cast<data_t>(0.);
   }
 }
 
@@ -31,7 +31,8 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy,
                                           const int num) {
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
-    dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
+    dx[i] = dy[i] * (y[i] > static_cast<data_t>(0.) ? static_cast<data_t>(1.)
+                                                    : static_cast<data_t>(0.));
   }
 }
 
@@ -42,7 +43,7 @@ std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
   int numel = x.size();
   int block = 512;
   int grid = (numel + block - 1) / block;
-  PD_DISPATCH_FLOATING_TYPES(
+  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       x.type(), "relu_cuda_forward_kernel", ([&] {
         relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
             x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
@@ -60,7 +61,7 @@ std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
   int numel = out.size();
   int block = 512;
   int grid = (numel + block - 1) / block;
-  PD_DISPATCH_FLOATING_TYPES(
+  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       out.type(), "relu_cuda_backward_kernel", ([&] {
         relu_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
             grad_out.data<data_t>(),
diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
index fbf5442ac0..0435f50b7c 100644
--- a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
+++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
@@ -118,3 +118,21 @@ PD_BUILD_OP(dispatch_test_float_and_integer_and_complex)
     .Inputs({"X"})
     .Outputs({"Out"})
     .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex));
+
+std::vector<paddle::Tensor> DispatchTestFloatAndHalf(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP(dispatch_test_float_and_half)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestFloatAndHalf));
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
index 1a96fc5f0a..23733d2084 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
@@ -50,11 +50,17 @@ class TestJITLoad(unittest.TestCase):
             custom_module.custom_relu, custom_module.custom_relu_dup
         ]
         self.dtypes = ['float32', 'float64']
-        self.devices = ['cpu', 'gpu']
+        if paddle.is_compiled_with_cuda():
+            self.dtypes.append('float16')
+        self.devices = ['cpu']
+        if paddle.is_compiled_with_cuda():
+            self.devices.append('gpu')
 
     def test_static(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out = custom_relu_static(custom_op, device, dtype, x)
@@ -68,6 +74,8 @@ class TestJITLoad(unittest.TestCase):
     def test_dynamic(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out, x_grad = custom_relu_dynamic(custom_op, device, dtype,
@@ -87,7 +95,7 @@ class TestJITLoad(unittest.TestCase):
         caught_exception = False
         try:
             x = np.random.uniform(-1, 1, [4, 8]).astype('int32')
-            custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'float32', x)
+            custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'int32', x)
         except OSError as e:
             caught_exception = True
             self.assertTrue(
@@ -105,15 +113,15 @@ class TestJITLoad(unittest.TestCase):
 
         caught_exception = False
         try:
-            x = np.random.uniform(-1, 1, [4, 8]).astype('int64')
-            custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'float32', x)
+            x = np.random.uniform(-1, 1, [4, 8]).astype('int32')
+            custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'int32', x)
         except OSError as e:
             caught_exception = True
             self.assertTrue(
-                "function \"relu_cuda_forward_kernel\" is not implemented for data type `int64_t`"
+                "function \"relu_cuda_forward_kernel\" is not implemented for data type `int32_t`"
                 in str(e))
             self.assertTrue(
-                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu:49" in
+                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu:50" in
                 str(e))
         self.assertTrue(caught_exception)
 
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
index 6781915e02..5c5c2d65a5 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
@@ -26,7 +26,7 @@ from paddle.utils.cpp_extension.extension_utils import run_cmd
 def custom_relu_dynamic(func, device, dtype, np_x, use_func=True):
     paddle.set_device(device)
 
-    t = paddle.to_tensor(np_x)
+    t = paddle.to_tensor(np_x, dtype=dtype)
     t.stop_gradient = False
 
     out = func(t) if use_func else paddle.nn.functional.relu(t)
@@ -171,7 +171,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
         ]
 
         self.dtypes = ['float32', 'float64']
-        self.devices = ['cpu', 'gpu']
+        if paddle.is_compiled_with_cuda():
+            self.dtypes.append('float16')
+        self.devices = ['cpu']
+        if paddle.is_compiled_with_cuda():
+            self.devices.append('gpu')
 
         # config seed
         SEED = 2021
@@ -181,6 +185,8 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
     def test_static(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out = custom_relu_static(custom_op, device, dtype, x)
@@ -194,6 +200,8 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
     def test_static_pe(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out = custom_relu_static_pe(custom_op, device, dtype, x)
@@ -207,6 +215,8 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
     def test_dynamic(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out, x_grad = custom_relu_dynamic(custom_op, device, dtype,
diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
index bc36372c6a..12e9f50a5e 100644
--- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
@@ -83,6 +83,12 @@ class TestJitDispatch(unittest.TestCase):
             self.run_dispatch_test(
                 dispatch_op.dispatch_test_float_and_integer_and_complex, dtype)
 
+    def test_dispatch_float_and_half(self):
+        dtypes = ["float32", "float64", "float16"]
+        for dtype in dtypes:
+            self.run_dispatch_test(dispatch_op.dispatch_test_float_and_half,
+                                   dtype)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 0afc3956a0..71d4afdb28 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -453,15 +453,12 @@ class InstallHeaders(Command):
 
     def copy_data_type_headers(self, header):
         if os.name == 'nt':
-            data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h']
+            data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h', 'platform\\float16.h']
         else:
-            data_type_headers = ['platform/complex64.h', 'platform/complex128.h']
+            data_type_headers = ['platform/complex64.h', 'platform/complex128.h', 'platform/float16.h']
         for dtype_header in data_type_headers:
             if dtype_header in header:
-                if os.name == 'nt':
-                    install_dir = os.path.join(self.install_dir, "paddle\\fluid\\extension\\include")
-                else:
-                    install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include")
+                install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include")
                 if not os.path.exists(install_dir):
                     self.mkpath(install_dir)
                 return self.copy_file(header, install_dir)
-- 
Gitee


From be5a481324c9fe0c477490e39f7ee6f536d507b1 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:31:28 +0800
Subject: [PATCH 25/45] !34 [oneDNN] Added Elementwise Mul grad  fp32/bf16
 Merge pull request !34 from
 PaddlePaddle-Gardener/prv-elementwise_mul-grad-bf16

---
 .../operators/elementwise/elementwise_op.h    |   5 +-
 .../mkldnn/elementwise_add_mkldnn_op.cc       |  11 ++
 .../mkldnn/elementwise_mkldnn_op.h            |   1 -
 .../mkldnn/elementwise_mul_mkldnn_op.cc       | 116 ++++++++++++++++++
 paddle/fluid/platform/mkldnn_reuse.h          |  10 +-
 .../test_elementwise_mul_bf16_mkldnn_op.py    |  66 ++++++++--
 .../mkldnn/test_elementwise_mul_mkldnn_op.py  |  12 +-
 7 files changed, 206 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 6ec73b02ad..e09f94a6c0 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -276,7 +276,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 
 #ifdef PADDLE_WITH_MKLDNN
     // If broadcasting is needed, use native implementation
-    auto CanMKLDNNElementwiseAddGradBeUsed = [&]() {
+    auto CanMKLDNNElementwiseGradBeUsed = [&]() {
       auto dx_dims = ctx.Input<Tensor>("X")->dims();
       auto dy_dims = ctx.Input<Tensor>("Y")->dims();
       // No broadcast or broadcasting of data on inner dims is supported
@@ -284,8 +284,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
     };
 
     if (this->CanMKLDNNBeUsed(ctx, input_data_type) &&
-        (ctx.Type() != "elementwise_add_grad" ||
-         CanMKLDNNElementwiseAddGradBeUsed())) {
+        CanMKLDNNElementwiseGradBeUsed()) {
       return framework::OpKernelType(input_data_type, ctx.GetPlace(),
                                      framework::DataLayout::kMKLDNN,
                                      framework::LibraryType::kMKLDNN);
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index 4db4adfe9e..b43dddfcf1 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -61,6 +61,9 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
                                            platform::EventRole::kUniqueOp);
       reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
       astream.wait();
+
+      dx->set_layout(DataLayout::kMKLDNN);
+      dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
     }
 
     if (dy) {
@@ -75,6 +78,9 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
         reorder_p->execute(astream, *reorder_src_memory_p,
                            *reorder_dst_memory_p);
         astream.wait();
+
+        dy->set_layout(DataLayout::kMKLDNN);
+        dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
       } else {
         // Broadcasting
         platform::ReductionMKLDNNHandler<T> handler_sum(
@@ -86,6 +92,11 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
         reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p},
                                        {DNNL_ARG_DST, *dy_memory_p}});
         astream.wait();
+
+        dy->set_layout(DataLayout::kMKLDNN);
+        dy->set_format(
+            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
+                paddle::framework::vectorize<int64_t>(dy->dims()))));
       }
     }
   }
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index 8a646e5865..df827117a0 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -15,7 +15,6 @@
 #pragma once
 #include <string>
 #include <unordered_map>
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
index 293b5a1a2d..c9209cc39d 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@@ -14,6 +14,118 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
 
+namespace paddle {
+namespace framework {
+class ExecutionContext;
+}  // namespace framework
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+template <typename T>
+class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
+
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* y = ctx.Input<framework::Tensor>("Y");
+    auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
+    if (dx) {
+      // dx = dout*y
+      platform::BinaryMKLDNNHandler<T> handler(
+          dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine,
+          ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f,
+          ctx.InputName(framework::GradVarName("Out")));
+
+      const auto src_dout_memory = handler.AcquireSrcMemory(dout);
+      const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
+      const auto dst_dx_memory = handler.AcquireDstMemory(dx);
+
+      const auto binary_prim = handler.AcquireForwardPrimitive();
+
+      const std::unordered_map<int, dnnl::memory> args = {
+          {DNNL_ARG_SRC_0, *src_dout_memory},
+          {DNNL_ARG_SRC_1, *src_y_memory},
+          {DNNL_ARG_DST, *dst_dx_memory}};
+
+      binary_prim->execute(astream, args);
+      astream.wait();
+
+      dx->set_layout(framework::DataLayout::kMKLDNN);
+      dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory));
+    }
+
+    if (dy) {
+      // dy = dout*x
+      // Handler is having nullptr passed instead of output tensor as
+      // we want Dst buffer to be allocated by oneDNN not to use Tensor
+      platform::BinaryMKLDNNHandler<T> handler(
+          dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine,
+          ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f,
+          ctx.InputName(framework::GradVarName("Out")));
+
+      const auto src_dout_memory = handler.AcquireSrcMemory(dout);
+      const auto src_x_memory = handler.AcquireSecondSrcMemory(x);
+
+      // If broadcasting is in use then let's write to temporary
+      // buffer allocated by oneDNN
+      const auto dst_dy_memory = (dout->dims() == dy->dims())
+                                     ? handler.AcquireDstMemory(dy)
+                                     : handler.AcquireDstMemory();
+
+      const auto binary_prim = handler.AcquireForwardPrimitive();
+
+      const std::unordered_map<int, dnnl::memory> args = {
+          {DNNL_ARG_SRC_0, *src_dout_memory},
+          {DNNL_ARG_SRC_1, *src_x_memory},
+          {DNNL_ARG_DST, *dst_dy_memory}};
+
+      binary_prim->execute(astream, args);
+      astream.wait();
+
+      dy->set_layout(framework::DataLayout::kMKLDNN);
+
+      // Reduction is needed for broadcasting scenario
+      if (dout->dims() != dy->dims()) {
+        platform::ReductionMKLDNNHandler<T> handler_sum(
+            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, mkldnn_engine,
+            ctx.GetPlace(), dout, dy,
+            ctx.InputName(framework::GradVarName("Out")));
+        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
+        auto reduction_p = handler_sum.AcquireForwardPrimitive();
+        // As source we use mem object with results from binary operation
+        reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory},
+                                       {DNNL_ARG_DST, *dy_memory_p}});
+        astream.wait();
+        dy->set_format(
+            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
+                paddle::framework::vectorize<int64_t>(dy->dims()))));
+
+      } else {
+        dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(
@@ -23,3 +135,7 @@ REGISTER_OP_KERNEL(
                              dnnl::algorithm::binary_mul>,
     ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_mul>,
     ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_mul>)
+
+REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseMulMKLDNNGradKernel<paddle::platform::bfloat16>,
+                   ops::EltwiseMulMKLDNNGradKernel<float>)
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 0503c3f71a..c79b642c51 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -87,6 +87,11 @@ class MKLDNNHandlerT {
                                             "@dst_mem_p");
   }
 
+  template <typename T_out = T>
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(void) {
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), "@dstt_mem_p");
+  }
+
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(
       const framework::Tensor* output) {
@@ -561,7 +566,10 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
 
       const auto src_x_tz = framework::vectorize(x->dims());
       const auto src_y_tz = framework::vectorize(y->dims());
-      const auto dst_tz = framework::vectorize(z->dims());
+      // if output tensor(z) is nullptr then we are computing into oneDNN
+      // managed buffer
+      const auto dst_tz =
+          (z == nullptr) ? src_x_tz : framework::vectorize(z->dims());
 
       const auto src0_md = dnnl::memory::desc(
           src_x_tz, platform::MKLDNNGetDataType<T>(), x->format());
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py
index c2716420fb..9b7f4b9b86 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py
@@ -30,10 +30,9 @@ class TestElementwiseMulBf16MklDNNOp(OpTest):
         self.axis = -1
 
         self.generate_data()
-        self.inputs = {
-            'X': convert_float_to_uint16(self.x),
-            'Y': convert_float_to_uint16(self.y)
-        }
+        self.x_bf16 = convert_float_to_uint16(self.x)
+        self.y_bf16 = convert_float_to_uint16(self.y)
+        self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16}
         self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
         self.outputs = {'Out': convert_float_to_uint16(self.out)}
 
@@ -46,13 +45,66 @@ class TestElementwiseMulBf16MklDNNOp(OpTest):
         self.check_output_with_place(core.CPUPlace())
 
     def test_check_grad_normal(self):
-        pass
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X", "Y"],
+            "Out",
+            check_dygraph=False,
+            user_defined_grads=[
+                np.multiply(self.x, self.y), np.multiply(self.x, self.x)
+            ],
+            user_defined_grad_outputs=[self.x_bf16])
 
     def test_check_grad_ingore_x(self):
-        pass
+        self.check_grad_with_place(
+            core.CPUPlace(), ["Y"],
+            "Out",
+            check_dygraph=False,
+            user_defined_grads=[np.multiply(self.y, self.x)],
+            user_defined_grad_outputs=[self.y_bf16])
 
     def test_check_grad_ingore_y(self):
-        pass
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X"],
+            "Out",
+            check_dygraph=False,
+            user_defined_grads=[np.multiply(self.x, self.y)],
+            user_defined_grad_outputs=[self.x_bf16])
+
+
+class TestElementwiseMulBroadcastingBf16MklDNNOp(
+        TestElementwiseMulBf16MklDNNOp):
+    def generate_data(self):
+        self.x = np.random.uniform(1, 2, [1, 2, 3, 100]).astype(np.float32)
+        self.y = np.random.uniform(1, 2, [100]).astype(np.float32)
+        self.out = np.multiply(self.x, self.y)
+
+    # Compute partial sums along all axes but last one
+    def compute_reduced_gradients(self, out_grads):
+        part_sum = np.add.reduceat(out_grads, [0], axis=0)
+        part_sum = np.add.reduceat(part_sum, [0], axis=1)
+        part_sum = np.add.reduceat(part_sum, [0], axis=2)
+        return part_sum.flatten()
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X", "Y"],
+            "Out",
+            check_dygraph=False,
+            user_defined_grads=[
+                np.multiply(self.x, self.y),
+                self.compute_reduced_gradients(np.multiply(self.x, self.x))
+            ],
+            user_defined_grad_outputs=[self.x_bf16])
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["Y"],
+            "Out",
+            check_dygraph=False,
+            user_defined_grads=[
+                self.compute_reduced_gradients(np.multiply(self.x, self.x))
+            ],
+            user_defined_grad_outputs=[self.x_bf16])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
index d66f3dfb89..03dc2421b6 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
@@ -17,6 +17,7 @@ import unittest
 import numpy as np
 from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci
 from paddle.fluid.tests.unittests.test_elementwise_mul_op import ElementwiseMulOp
+from paddle import enable_static
 
 
 class TestMKLDNNElementwiseMulOp(ElementwiseMulOp):
@@ -51,13 +52,17 @@ class TestMKLDNNElementwiseMulOp4(TestMKLDNNElementwiseMulOp):
     def test_check_grad_normal(self):
         pass
 
-    def test_check_grad_ingore_x(self):
-        pass
-
     def test_check_grad_ingore_y(self):
         pass
 
 
+class TestMKLDNNElementwiseMulOp5(TestMKLDNNElementwiseMulOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
+        self.out = np.multiply(self.x, self.y)
+
+
 ''' INT8 Tests '''
 
 
@@ -140,4 +145,5 @@ class TestUint8Scales(TestInt8Scales):
 
 
 if __name__ == '__main__':
+    enable_static()
     unittest.main()
-- 
Gitee


From b120ae71245990e95beb50dea0889058caffd6a3 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 14:36:02 +0800
Subject: [PATCH 26/45] !26 MaskRCNN OPs stream optimization Merge pull request
 !26 from PaddlePaddle-Gardener/stream_flow_opt

---
 paddle/fluid/operators/detection/generate_proposals_op.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index 8359fbab51..e8ab628db1 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
       memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
                    scores.data<T>(), sizeof(T) * scores.numel(),
                    dev_ctx.stream());
-      dev_ctx.Wait();
       num_proposals += proposals.dims()[0];
       offset.emplace_back(num_proposals);
       tmp_num.push_back(proposals.dims()[0]);
-- 
Gitee


From f05648d6ec75d7fee929821f5aaa2a33427582fe Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 16:14:00 +0800
Subject: [PATCH 27/45] !36 fix bug of DepthwiseConvTransposeGradKernel Merge
 pull request !36 from PaddlePaddle-Gardener/bug_fix

---
 paddle/fluid/operators/conv_transpose_op.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h
index 651719f105..ecf5b6d774 100644
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -682,9 +682,9 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
     if (input_grad) {
       math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
       depthwiseConv(
-          dev_ctx, *output_grad, filter, strides, paddings,
+          dev_ctx, *output_grad, filter, strides,
           std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
-          input_grad, data_layout);
+          dilations, input_grad, data_layout);
     }
 
     if (filter_grad) {
-- 
Gitee


From 9e4fef9aa220711a10af0ad5a29495d58dcb616f Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 16:14:01 +0800
Subject: [PATCH 28/45] !37 [ROCM] fix test_conv2d_transpose_op Merge pull
 request !37 from PaddlePaddle-Gardener/bugfix_rocm_conv2d_transpose_op

---
 paddle/fluid/operators/conv_transpose_cudnn_op.cu             | 4 ++--
 .../paddle/fluid/tests/unittests/test_conv2d_transpose_op.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
index 5781dd18b7..a712d31cf7 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
@@ -202,7 +202,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 
     int iwo_groups = groups;
     int c_groups = 1;
-#if CUDNN_VERSION_MIN(7, 0, 1)
+#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
     iwo_groups = 1;
     c_groups = groups;
     groups = 1;
@@ -452,7 +452,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
 
     int iwo_groups = groups;
     int c_groups = 1;
-#if CUDNN_VERSION_MIN(7, 0, 1)
+#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
     iwo_groups = 1;
     c_groups = groups;
     groups = 1;
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index fb6058c0f0..4e582d74c2 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -116,7 +116,7 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
 class TestConv2DTransposeOp(OpTest):
     def setUp(self):
         # init as conv transpose
-        self.dtype = np.float64
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.need_check_grad = True
         self.is_test = False
         self.use_cudnn = False
-- 
Gitee


From 9151fbb1aa3ccc91c372d04e4b8a2839e3074dc3 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Tue, 23 Mar 2021 16:14:01 +0800
Subject: [PATCH 29/45] !38 Fix skip_quant in QAT Merge pull request !38 from
 PaddlePaddle-Gardener/fix_skip_quant

---
 .../slim/quantization/imperative/qat.py       | 38 +++++++++++++++++--
 .../slim/quantization/imperative/utils.py     |  6 +++
 .../slim/tests/test_imperative_out_scale.py   |  7 ++++
 .../slim/tests/test_imperative_skip_op.py     | 16 ++++++--
 4 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index abfe06a332..68b4cfdc66 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -515,6 +515,8 @@ class ImperativeCalcOutputScale(object):
                                 self._out_scale_dict[ops_list[op_count]])
                             op_count += 1
 
+            self._set_skip_quant_attr(inference_program)
+
         # save the final quantized model that has output scales
         save_inference_model(
             dirname=dirname,
@@ -537,9 +539,12 @@ class ImperativeCalcOutputScale(object):
         Init the scale params for calculating output scales and save them in the
         target layer.
         After the users define the dygraph model, the hooks for calculating output
-        scales will not execute immediately. If the users load the checkpoint now,
-        the scale params have not been created, so them cann't be loaded.
-        Therefore, define the scale params in the beginning.
+        scales will not execute immediately. If the users load parameters form
+        checkpoint and save the quantized inference model immediately, the inference
+        model would not be saved successfully. Beacuse the dygraph_to_static requires
+        that the parameters created in __init__, but the uniqueness of hook make it
+        impossible to create parameters in __init__. To avoid this mistake, we define
+        the scale parameters in the beginning instead of hook.
         """
 
         def _create_param(in_layer, first_name, last_name, dtype):
@@ -587,6 +592,33 @@ class ImperativeCalcOutputScale(object):
             op_type = op_type.replace('relu', 're_lu')
         return op_type in layer_name
 
+    def _set_skip_quant_attr(self, program):
+        block = program.global_block()
+        for op in block.ops:
+            if self._is_skip_quant_op(block, op):
+                op._set_attr("skip_quant", True)
+
+    def _is_skip_quant_op(self, block, in_op):
+        """
+        The input op should be skipped quantization.
+        1. the type of input op should be conv2d, depthwise_conv2d or matmul
+        2. the previous ops of the input op are not fake_quantize_dequantize ops
+        """
+
+        def _find_previous_op(block, var_name):
+            for op in block.ops:
+                if var_name in op.output_arg_names:
+                    return op
+
+        target_op_types = ["conv2d", "depthwise_conv2d", "matmul"]
+        if in_op.type not in target_op_types:
+            return False
+
+        previous_ops = [_find_previous_op(block, arg_name) \
+            for arg_name in in_op.input_arg_names]
+        return any(op is not None and op.type not in utils.fake_quantize_dequantize_types \
+            for op in previous_ops )
+
     def _calc_output_scale_hook(self, layer, input, output):
         """
         Create the MovingAverageAbsMaxScale layer for the target layer if needed.
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index 1ff4a408e0..3bf655265c 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -52,6 +52,12 @@ supported_quant_layers_map = {
     'LayerNorm': paddle.nn.LayerNorm,
 }
 
+fake_quantize_dequantize_types = [
+    "fake_quantize_dequantize_abs_max",
+    "fake_quantize_dequantize_channel_wise_abs_max",
+    "fake_quantize_dequantize_moving_average_abs_max"
+]
+
 out_scale_layers_list = (
     paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.MaxPool2D,
     paddle.nn.BatchNorm, paddle.nn.BatchNorm2D, paddle.nn.SyncBatchNorm,
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
index 83ddac4196..ed29375d22 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
@@ -393,12 +393,16 @@ class TestImperativeOutSclae(unittest.TestCase):
             if 'fake' in op.type:
                 static_ops.remove(op)
 
+        op_count = 0
         for i in range(len(dynamic_ops)):
             if dynamic_ops[i].has_attr("out_threshold"):
+                op_count += 1
                 self.assertTrue(dynamic_ops[i].type == static_ops[i].type)
                 self.assertTrue(dynamic_ops[i].attr("out_threshold") ==
                                 static_ops[i].attr("out_threshold"))
 
+        self.assertTrue(op_count == 13)
+
 
 class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
     def test_save_quantized_model(self):
@@ -459,11 +463,14 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
             if 'fake' in op.type:
                 static_ops.remove(op)
 
+        op_count = 0
         for i in range(len(dynamic_ops)):
             if dynamic_ops[i].has_attr("out_threshold"):
+                op_count += 1
                 self.assertTrue(dynamic_ops[i].type == static_ops[i].type)
                 self.assertTrue(dynamic_ops[i].attr("out_threshold") ==
                                 static_ops[i].attr("out_threshold"))
+        self.assertTrue(op_count == 13)
 
 
 class TestSaveQuantizedModel_Warning(unittest.TestCase):
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py
index 0561055e6e..bda02769ce 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py
@@ -200,9 +200,12 @@ class TestImperativeOutSclae(unittest.TestCase):
                 params_filename="lenet" + INFER_PARAMS_SUFFIX))
         model_ops = inference_program.global_block().ops
 
-        conv2d_count, mul_count = 0, 0
+        conv2d_count, matmul_count = 0, 0
+        conv2d_skip_count, matmul_skip_count = 0, 0
         for i, op in enumerate(model_ops):
             if op.type == 'conv2d':
+                if op.has_attr("skip_quant"):
+                    conv2d_skip_count += 1
                 if conv2d_count > 0:
                     self.assertTrue(
                         'fake_quantize_dequantize' in model_ops[i - 1].type)
@@ -211,14 +214,19 @@ class TestImperativeOutSclae(unittest.TestCase):
                         'fake_quantize_dequantize' not in model_ops[i - 1].type)
                 conv2d_count += 1
 
-            if op.type == 'mul':
-                if mul_count > 0:
+            if op.type == 'matmul':
+                if op.has_attr("skip_quant"):
+                    matmul_skip_count += 1
+                if matmul_count > 0:
                     self.assertTrue(
                         'fake_quantize_dequantize' in model_ops[i - 1].type)
                 else:
                     self.assertTrue(
                         'fake_quantize_dequantize' not in model_ops[i - 1].type)
-                mul_count += 1
+                matmul_count += 1
+
+        self.assertTrue(conv2d_skip_count == 1)
+        self.assertTrue(matmul_skip_count == 1)
 
 
 if __name__ == '__main__':
-- 
Gitee


From d19aaf0c1934f77283ed96a8686a35abc2af340f Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:29:57 +0800
Subject: [PATCH 30/45] !39 [3D-parallel] add 1f1b scheduler for pipeline Merge
 pull request !39 from PaddlePaddle-Gardener/1f1b_pp

---
 paddle/fluid/framework/device_worker.h        |  20 +-
 .../framework/distributed_strategy.proto      |   1 +
 paddle/fluid/framework/pipeline_trainer.cc    |  10 +-
 paddle/fluid/framework/section_worker.cc      | 173 ++++++++++++------
 paddle/fluid/framework/trainer_desc.proto     |   3 +
 .../meta_optimizers/pipeline_optimizer.py     |   9 +-
 python/paddle/fluid/device_worker.py          |  12 ++
 python/paddle/fluid/optimizer.py              |   5 +
 .../fluid/tests/unittests/pipeline_mnist.py   |  23 ++-
 .../unittests/pipeline_mnist_one_device.py    |   4 +
 .../fluid/tests/unittests/test_pipeline.py    |   6 +-
 11 files changed, 193 insertions(+), 73 deletions(-)

diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 3038719539..05c54a90f7 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -28,6 +28,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/heter_service.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -454,7 +455,7 @@ class HeterBoxWorker : public HogwildWorker {
   virtual void CacheProgram(const ProgramDesc& main_program) {
     new (&program_) ProgramDesc(main_program);
   }
-  virtual void ProduceTasks() override;
+  void ProduceTasks() override;
   virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
   virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
   virtual void TrainFilesWithProfiler() {}
@@ -555,7 +556,7 @@ class PSGPUWorker : public HogwildWorker {
   virtual void CacheProgram(const ProgramDesc& main_program) {
     new (&program_) ProgramDesc(main_program);
   }
-  virtual void ProduceTasks() override;
+  void ProduceTasks() override;
   virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
   virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
   void ResetStat();
@@ -659,6 +660,9 @@ class SectionWorker : public DeviceWorker {
   void SetDeviceIndex(int tid) override {}
   void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
   void SetMicrobatchNum(int num) { num_microbatches_ = num; }
+  void SetPipelineStageNum(int num) { num_pipeline_stages_ = num; }
+  void SetPipelineStage(int stage) { pipeline_stage_ = stage; }
+  void SetScheduleMode(int mode) { schedule_mode_ = mode; }
   void SetMicrobatchScopes(const std::vector<Scope*>& scope) {
     microbatch_scopes_ = scope;
   }
@@ -666,11 +670,23 @@ class SectionWorker : public DeviceWorker {
   void SetSkipVars(const std::vector<std::string>& skip_vars) {
     skip_vars_ = skip_vars;
   }
+  void RunBackward(
+      int micro_id, std::unique_ptr<GarbageCollector>&,
+      std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
+  void RunForward(
+      int micro_id, std::unique_ptr<GarbageCollector>&,
+      std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
+  void RunUpdate(
+      std::unique_ptr<GarbageCollector>&,
+      std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
 
  protected:
   int section_id_;
   int thread_id_;
   int num_microbatches_;
+  int num_pipeline_stages_;
+  int pipeline_stage_;
+  int schedule_mode_;  // 0 for F-then-B and 1 for 1F1B
   std::vector<Scope*> microbatch_scopes_;
   std::vector<std::string> skip_vars_;
   const Scope* minibatch_scope_;
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 300f0eb0db..b36793507f 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -120,6 +120,7 @@ message AsyncConfig {
 message PipelineConfig {
   optional int32 micro_batch_size = 1 [ default = 1 ];
   optional int32 accumulate_steps = 2 [ default = 1 ];
+  optional string schedule_mode = 3 [ default = '1F1B' ];
 }
 
 message DistributedStrategy {
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index 8d350f7016..a97fc2e75a 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -24,6 +24,9 @@ namespace framework {
 void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
                                  Dataset* dataset) {
   const auto& section_params = trainer_desc.section_param();
+  const int num_pipeline_stages_ = section_params.num_pipeline_stages();
+  const int pipeline_stage_ = section_params.pipeline_stage();
+  const int schedule_mode_ = section_params.schedule_mode();
   num_microbatches_ = section_params.num_microbatches();
   VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
   trainer_desc_ = trainer_desc;
@@ -39,6 +42,9 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   this_worker->SetPlace(place_);
   this_worker->Initialize(trainer_desc);
   this_worker->SetMicrobatchNum(num_microbatches_);
+  this_worker->SetPipelineStageNum(num_pipeline_stages_);
+  this_worker->SetPipelineStage(pipeline_stage_);
+  this_worker->SetScheduleMode(schedule_mode_);
 }
 
 void PipelineTrainer::InitOtherEnv(const ProgramDesc& main_program) {
@@ -75,7 +81,9 @@ void PipelineTrainer::CopyParameters(int microbatch_id,
   for (auto& var : global_block.AllVars()) {
     bool is_param_grad = false;
     size_t pos = 0;
-    if ((pos = var->Name().find(kGradVarSuffix)) != std::string::npos) {
+    // A magic suffix to indicate the merged gradient
+    std::string magicSuffix = std::string(kGradVarSuffix) + "@MERGED";
+    if ((pos = var->Name().find(magicSuffix)) != std::string::npos) {
       auto prefix_name = var->Name().substr(0, pos);
       if (param_map.find(prefix_name) != param_map.end()) {
         is_param_grad = true;
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 735c86faf0..90a371e474 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -22,15 +22,79 @@ class TrainerDesc;
 
 uint64_t SectionWorker::batch_id_(0);
 
-void SectionWorker::Initialize(const TrainerDesc& desc) {
+void SectionWorker::Initialize(const TrainerDesc &desc) {
   dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);
   program_.reset(
       new ProgramDesc(desc.section_param().section_config().program_desc()));
-  for (auto& op_desc : program_->Block(0).AllOps()) {
+  for (auto &op_desc : program_->Block(0).AllOps()) {
     ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
 }
 
+void SectionWorker::RunForward(
+    int micro_id, std::unique_ptr<GarbageCollector> &gc,
+    std::unordered_map<const OperatorBase *, std::vector<std::string>>
+        &unused_vars_) {
+  for (auto &op : ops_) {
+    int op_role = op->Attr<int>(std::string("op_role"));
+    // We run op with op_role = kLRSched only for the first microbatch
+    // to avoid increasing the @LR_DECAY_STEP@ multiple times.
+    bool run_first_mbatch = op_role == static_cast<int>(OpRole::kForward) ||
+                            op_role == (static_cast<int>(OpRole::kForward) |
+                                        static_cast<int>(OpRole::kLoss)) ||
+                            op_role == static_cast<int>(OpRole::kLRSched);
+    bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
+                      op_role == (static_cast<int>(OpRole::kForward) |
+                                  static_cast<int>(OpRole::kLoss));
+    if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) {
+      VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch "
+              << micro_id;
+      op->Run(*microbatch_scopes_[micro_id], place_);
+      if (gc) {
+        DeleteUnusedTensors(*microbatch_scopes_[micro_id], op.get(),
+                            unused_vars_, gc.get());
+      }
+    }
+  }
+}
+
+void SectionWorker::RunBackward(
+    int micro_id, std::unique_ptr<GarbageCollector> &gc,
+    std::unordered_map<const OperatorBase *, std::vector<std::string>>
+        &unused_vars_) {
+  for (auto &op : ops_) {
+    int op_role = op->Attr<int>(std::string("op_role"));
+    if (op_role == static_cast<int>(OpRole::kBackward) ||
+        op_role == (static_cast<int>(OpRole::kBackward) |
+                    static_cast<int>(OpRole::kLoss))) {
+      VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch "
+              << micro_id;
+      op->Run(*microbatch_scopes_[micro_id], place_);
+      if (gc) {
+        DeleteUnusedTensors(*microbatch_scopes_[micro_id], op.get(),
+                            unused_vars_, gc.get());
+      }
+    }
+  }
+}
+
+void SectionWorker::RunUpdate(
+    std::unique_ptr<GarbageCollector> &gc,
+    std::unordered_map<const OperatorBase *, std::vector<std::string>>
+        &unused_vars_) {
+  for (auto &op : ops_) {
+    int op_role = op->Attr<int>(std::string("op_role"));
+    if (op_role == static_cast<int>(OpRole::kOptimize)) {
+      VLOG(3) << "Update: running op " << op->Type();
+      op->Run(*microbatch_scopes_[num_microbatches_ - 1], place_);
+      if (gc) {
+        DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1],
+                            op.get(), unused_vars_, gc.get());
+      }
+    }
+  }
+}
+
 void SectionWorker::TrainFiles() {
   VLOG(5) << "begin section_worker TrainFiles";
 
@@ -48,69 +112,56 @@ void SectionWorker::TrainFiles() {
 #endif
   }
 
-  for (int i = 0; i < num_microbatches_; ++i) {
-    for (auto& op : ops_) {
-      int op_role = op->Attr<int>(std::string("op_role"));
-      // We run op with op_role = kLRSched only for the first microbatch
-      // to avoid increasing the @LR_DECAY_STEP@ multiple times.
-      bool run_first_mbatch = op_role == static_cast<int>(OpRole::kForward) ||
-                              op_role == (static_cast<int>(OpRole::kForward) |
-                                          static_cast<int>(OpRole::kLoss)) ||
-                              op_role == static_cast<int>(OpRole::kLRSched);
-      bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
-                        op_role == (static_cast<int>(OpRole::kForward) |
-                                    static_cast<int>(OpRole::kLoss));
-      if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
-        VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch "
-                << i;
-        op->Run(*microbatch_scopes_[i], place_);
-        if (gc) {
-          DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_,
-                              gc.get());
-        }
-      }
+  if (schedule_mode_ == 0) {
+    // F-then-B scheduler which runs Forward phase for all microbatches,
+    // then runs Backward phase for all microbatches.
+    // step1: run forward
+    for (int i = 0; i < num_microbatches_; ++i) {
+      RunForward(i, gc, unused_vars_);
     }
-#ifdef PADDLE_WITH_RCCL
-    hipDeviceSynchronize();
-#else
-    cudaDeviceSynchronize();
-#endif
-  }
-
-  // backward pass
-  for (int i = 0; i < num_microbatches_; ++i) {
-    for (auto& op : ops_) {
-      int op_role = op->Attr<int>(std::string("op_role"));
-      if (op_role == static_cast<int>(OpRole::kBackward) ||
-          op_role == (static_cast<int>(OpRole::kBackward) |
-                      static_cast<int>(OpRole::kLoss))) {
-        VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch "
-                << i;
-        op->Run(*microbatch_scopes_[i], place_);
-        if (gc) {
-          DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_,
-                              gc.get());
-        }
-      }
+    // step2: run backward
+    for (int i = 0; i < num_microbatches_; ++i) {
+      RunBackward(i, gc, unused_vars_);
+    }
+    // step3: run update
+    RunUpdate(gc, unused_vars_);
+  } else {
+    // 1F1B scheduler, which runs forward phase and backward phase altertively
+    // after startup phase. For a stage, the number of microbatches for
+    // startup is num_pipeline_stages_ - pipeline_stage_ - 1, where
+    // num_pipeline_stages_ is the total number of pipeline stages and
+    // pipeline_stage_ is the pipeline stage of the current device.
+    auto startup_steps = num_pipeline_stages_ - pipeline_stage_ - 1;
+    VLOG(3) << "startup_steps:" << startup_steps
+            << ", num_stages: " << num_pipeline_stages_
+            << ", stage:" << pipeline_stage_;
+    PADDLE_ENFORCE_GT(
+        num_microbatches_, startup_steps,
+        platform::errors::InvalidArgument(
+            "To use pipeline with 1F1B scheduler, please make sure number of "
+            "microbatches (%d) is than startup steps (%d).",
+            num_microbatches_, startup_steps));
+    int fw_step = 0;
+    int bw_step = 0;
+    // startup phase
+    while (fw_step < startup_steps) {
+      RunForward(fw_step, gc, unused_vars_);
+      fw_step += 1;
     }
-#ifdef PADDLE_WITH_RCCL
-    hipDeviceSynchronize();
-#else
-    cudaDeviceSynchronize();
-#endif
-  }
 
-  // update pass
-  for (auto& op : ops_) {
-    int op_role = op->Attr<int>(std::string("op_role"));
-    if (op_role == static_cast<int>(OpRole::kOptimize)) {
-      VLOG(3) << "Update: running op " << op->Type();
-      op->Run(*microbatch_scopes_[0], place_);
-      if (gc) {
-        DeleteUnusedTensors(*microbatch_scopes_[0], op.get(), unused_vars_,
-                            gc.get());
-      }
+    // 1f1b phase
+    while (fw_step < num_microbatches_) {
+      RunForward(fw_step, gc, unused_vars_);
+      fw_step += 1;
+      RunBackward(bw_step, gc, unused_vars_);
+      bw_step += 1;
+    }
+    // backward phase
+    while (bw_step < num_microbatches_) {
+      RunBackward(bw_step, gc, unused_vars_);
+      bw_step += 1;
     }
+    RunUpdate(gc, unused_vars_);
   }
   dev_ctx_->Wait();
   ++batch_id_;
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 70481cf372..504885ff5c 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -93,6 +93,9 @@ message SectionWorkerParameter {
   optional int32 start_cpu_core_id = 4 [ default = 1 ];
   repeated string param_need_sync = 5;
   optional int32 num_microbatches = 6;
+  optional int32 num_pipeline_stages = 7 [ default = 1 ];
+  optional int32 pipeline_stage = 8 [ default = 1 ];
+  optional int32 schedule_mode = 9 [ default = 0 ];
 }
 
 message SectionConfig {
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index 1b79de03fd..9535c9ef53 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -138,7 +138,10 @@ class PipelineOptimizer(MetaOptimizerBase):
         super(PipelineOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
         # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = [
+            "RecomputeOptimizer",
+            "AMPOptimizer",
+        ]
         self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ]
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
@@ -149,6 +152,8 @@ class PipelineOptimizer(MetaOptimizerBase):
             'micro_batch_size']
         self.num_microbatches = user_defined_strategy.pipeline_configs[
             'accumulate_steps']
+        self.schedule_mode = user_defined_strategy.pipeline_configs[
+            'schedule_mode']
 
     def _can_apply(self):
         if not self.role_maker._is_collective:
@@ -167,6 +172,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         dist_strategy.pipeline_configs = {
             "micro_batch_size": 1,
             "accumulate_steps": 1,
+            "schedule_mode": "1F1B",
         }
 
     def minimize_impl(self,
@@ -192,6 +198,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         loss.block.program._pipeline_opt['local_rank'] = self.rank
         loss.block.program._pipeline_opt[
             'micro_batch_size'] = self.micro_batch_size
+        loss.block.program._pipeline_opt['schedule_mode'] = self.schedule_mode
         optimize_ops, params_grads, prog_list = self.wrapped_opt.minimize(
             loss, startup_program, parameter_list, no_grad_set)
         assert prog_list
diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py
index 838aea37f1..b923f36af8 100644
--- a/python/paddle/fluid/device_worker.py
+++ b/python/paddle/fluid/device_worker.py
@@ -413,6 +413,18 @@ class Section(DeviceWorker):
         section_param = trainer_desc.section_param
         section_param.num_microbatches = pipeline_opt["num_microbatches"]
         section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
+        section_param.pipeline_stage = pipeline_opt["pipeline_stage"]
+        section_param.num_pipeline_stages = pipeline_opt["num_pipeline_stages"]
+        schedule_mode_str = pipeline_opt["schedule_mode"]
+        # F-then-B scheduler which runs Forward phase for all microbatches,
+        # then runs Backward phase for all microbatches.
+        # 1F1B scheduler, which runs forward phase and backward phase altertively
+        # after startup phase.
+        assert schedule_mode_str in ["F-then-B", "1F1B"], (
+            "The schedule mode "
+            "for pipeline must be one of F-then-B or 1F1B")
+        schedule_mode = 0 if schedule_mode_str == "F-then-B" else 1
+        section_param.schedule_mode = schedule_mode
         cfg = section_param.section_config
         program = pipeline_opt["section_program"]
         cfg.program_desc.ParseFromString(program["program"]._get_desc()
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 80f49ea939..9c724cbfdd 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -4273,6 +4273,7 @@ class PipelineOptimizer(object):
             grad_name = self._append_grad_suffix(param_name)
             if not main_block.has_var(grad_name): continue
             grad_var = main_block.vars[grad_name]
+            grad_var.persistable = True
             main_block._insert_op(
                 index=0,
                 type='fill_constant',
@@ -4517,6 +4518,7 @@ class PipelineOptimizer(object):
                 "You must use pipeline with fleet"
         local_rank = main_program._pipeline_opt['local_rank'] % len(
             device_specs)
+        self.schedule_mode = main_program._pipeline_opt['schedule_mode']
 
         place_list = []
         for dev_spec in device_specs:
@@ -4543,6 +4545,9 @@ class PipelineOptimizer(object):
         main_program._pipeline_opt = {
             "trainer": "PipelineTrainer",
             "device_worker": "Section",
+            "pipeline_stage": local_rank,
+            "num_pipeline_stages": len(device_specs),
+            "schedule_mode": self.schedule_mode,
             "inner_parallelism": len(device_specs),
             "section_program": program_list[local_rank],
             "place": place_list[local_rank],
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
index d06be76b33..f433af2481 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
@@ -110,22 +110,31 @@ class TestDistMnist2x2(TestDistRunnerBase):
         lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
         opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)
 
-        # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-
+        acc_steps = 2  # accumulated steps for pipeline
         if dist_strategy:
+            # Reader
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=batch_size)
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=batch_size)
             fleet.init(is_collective=True)
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
-            strategy.pipeline_configs = {'micro_batch_size': batch_size, }
+            strategy.pipeline_configs = {
+                'micro_batch_size': batch_size,
+                'schedule_mode': '1F1B',
+                'accumulate_steps': acc_steps
+            }
             dist_opt = fleet.distributed_optimizer(
                 optimizer=opt, strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
+            # Reader
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps)
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps)
 
         if dist_strategy:
             return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
index d8d28ac109..41b3ad3410 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
@@ -122,6 +122,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
         if dist_strategy:
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
+            strategy.pipeline_configs = {
+                'schedule_mode': 'F-then-B',
+                'micro_batch_size': batch_size
+            }
             dist_opt = fleet.distributed_optimizer(
                 optimizer=opt, strategy=strategy)
             dist_opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py
index e6d585e5bc..cd592416c1 100644
--- a/python/paddle/fluid/tests/unittests/test_pipeline.py
+++ b/python/paddle/fluid/tests/unittests/test_pipeline.py
@@ -34,9 +34,13 @@ class TestPipeline(TestDistBase):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
+            # TODO (sandyhouse) fix the delta value.
+            # Now pipeline only gets the loss value of the last
+            # microbatch, so it is not consistable with the
+            # non-pipeline one.
             self.check_with_place(
                 "pipeline_mnist.py",
-                delta=1e-5,
+                delta=1e0,
                 check_error_log=True,
                 log_name=flag_name)
 
-- 
Gitee


From a376657b66c5da5f8d22280e078be08e7b5867e3 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:29:59 +0800
Subject: [PATCH 31/45] !40 [oneDNN] Initial bf16 amp integration Merge pull
 request !40 from PaddlePaddle-Gardener/arlesniak/bf16_api

---
 paddle/fluid/operators/cast_op.cc             |   1 +
 paddle/fluid/operators/scale_op.cc            |   2 +
 .../fluid/contrib/mixed_precision/__init__.py |   3 +
 .../contrib/mixed_precision/fp16_lists.py     |   2 +-
 .../fluid/contrib/tests/test_bf16_utils.py    | 144 ++++++++++++++++++
 .../contrib/tests/test_model_cast_to_bf16.py  | 138 +++++++++++++++++
 python/paddle/fluid/data_feeder.py            |  23 ++-
 python/paddle/fluid/layers/nn.py              |  16 +-
 .../fluid/tests/book/test_fit_a_line.py       |  17 ++-
 .../fluid/tests/book/test_word2vec_book.py    |  29 ++--
 .../paddle/fluid/tests/unittests/op_test.py   |  17 +--
 python/paddle/static/amp/__init__.py          |   3 +
 python/setup.py.in                            |   1 +
 tools/parallel_UT_rule.py                     |   1 +
 tools/static_mode_white_list.py               |   1 +
 15 files changed, 360 insertions(+), 38 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/tests/test_bf16_utils.py
 create mode 100644 python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py

diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
index c5cfa7a3ba..40f4b969ec 100644
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -97,5 +97,6 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                        ops::CastOpKernel<CPU, bool>,
                        ops::CastOpKernel<CPU, uint8_t>,
                        ops::CastOpKernel<CPU, paddle::platform::float16>,
+                       ops::CastOpKernel<CPU, paddle::platform::bfloat16>,
                        ops::CastOpKernel<CPU, paddle::platform::complex64>,
                        ops::CastOpKernel<CPU, paddle::platform::complex128>);
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 281689d3bd..a9b1f299da 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -128,6 +128,8 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker,
 REGISTER_OP_CPU_KERNEL(
     scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
     ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext,
+                     paddle::platform::bfloat16>,
     ops::ScaleKernel<paddle::platform::CPUDeviceContext, uint8_t>,
     ops::ScaleKernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::ScaleKernel<paddle::platform::CPUDeviceContext, int16_t>,
diff --git a/python/paddle/fluid/contrib/mixed_precision/__init__.py b/python/paddle/fluid/contrib/mixed_precision/__init__.py
index a580ae5574..571b755b50 100644
--- a/python/paddle/fluid/contrib/mixed_precision/__init__.py
+++ b/python/paddle/fluid/contrib/mixed_precision/__init__.py
@@ -20,7 +20,10 @@ from . import fp16_lists
 from .fp16_lists import *
 from . import fp16_utils
 from .fp16_utils import *
+from . import bf16
+from .bf16 import *
 
 __all__ = decorator.__all__
 __all__ += fp16_lists.__all__
 __all__ += fp16_utils.__all__
+__all__ += bf16.__all__
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
index c88ae2d9cb..6a524af4ee 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
@@ -69,7 +69,7 @@ class AutoMixedPrecisionLists(object):
                 self.unsupported_list.add(op_name)
 
 
-# The three sets listed below are changed dynamiclly. They don't contain all  
+# The three sets listed below are changed dynamiclly. They don't contain all
 # paddle ops currently.
 
 # The set of ops that support fp16 calculation and are considered numerically-
diff --git a/python/paddle/fluid/contrib/tests/test_bf16_utils.py b/python/paddle/fluid/contrib/tests/test_bf16_utils.py
new file mode 100644
index 0000000000..faf2307f81
--- /dev/null
+++ b/python/paddle/fluid/contrib/tests/test_bf16_utils.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.contrib.mixed_precision as amp
+from paddle.fluid import core
+import paddle
+
+paddle.enable_static()
+
+
+class AMPTest(unittest.TestCase):
+    def setUp(self):
+        self.bf16_list = copy.copy(amp.bf16.amp_lists.bf16_list)
+        self.fp32_list = copy.copy(amp.bf16.amp_lists.fp32_list)
+        self.gray_list = copy.copy(amp.bf16.amp_lists.gray_list)
+        self.amp_lists_ = None
+
+    def tearDown(self):
+        self.assertEqual(self.amp_lists_.bf16_list, self.bf16_list)
+        self.assertEqual(self.amp_lists_.fp32_list, self.fp32_list)
+        self.assertEqual(self.amp_lists_.gray_list, self.gray_list)
+
+    def test_amp_lists(self):
+        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16()
+
+    def test_amp_lists_1(self):
+        # 1. w={'exp}, b=None
+        self.bf16_list.add('exp')
+        self.fp32_list.remove('exp')
+
+        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16({'exp'})
+
+    def test_amp_lists_2(self):
+        # 2. w={'tanh'}, b=None
+        self.fp32_list.remove('tanh')
+        self.bf16_list.add('tanh')
+
+        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16({'tanh'})
+
+    def test_amp_lists_3(self):
+        # 3. w={'lstm'}, b=None
+        self.bf16_list.add('lstm')
+
+        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16({'lstm'})
+
+    def test_amp_lists_4(self):
+        # 4. w=None, b={'elementwise_add'}
+        self.bf16_list.remove('elementwise_add')
+        self.fp32_list.add('elementwise_add')
+
+        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+            custom_fp32_list={'elementwise_add'})
+
+    def test_amp_lists_5(self):
+        # 5. w=None, b={'elementwise_add'}
+        self.fp32_list.add('elementwise_add')
+        self.bf16_list.remove('elementwise_add')
+
+        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+            custom_fp32_list={'elementwise_add'})
+
+    def test_amp_lists_6(self):
+        # 6. w=None, b={'lstm'}
+        self.fp32_list.add('lstm')
+
+        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+            custom_fp32_list={'lstm'})
+
+    def test_amp_lists_7(self):
+        self.fp32_list.add('reshape2')
+        self.gray_list.remove('reshape2')
+
+        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+            custom_fp32_list={'reshape2'})
+
+    def test_amp_list_8(self):
+        self.bf16_list.add('reshape2')
+        self.gray_list.remove('reshape2')
+
+        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+            custom_bf16_list={'reshape2'})
+
+
+class AMPTest2(unittest.TestCase):
+    def test_amp_lists_(self):
+        # 7. w={'lstm'} b={'lstm'}
+        # raise ValueError
+        self.assertRaises(ValueError, amp.AutoMixedPrecisionListsBF16,
+                          {'lstm'}, {'lstm'})
+
+    def test_find_op_index(self):
+        block = fluid.default_main_program().global_block()
+        op_desc = core.OpDesc()
+        idx = amp.bf16.amp_utils.find_op_index(block.desc, op_desc)
+        assert (idx == -1)
+
+    def test_is_in_fp32_varnames(self):
+        block = fluid.default_main_program().global_block()
+
+        var1 = block.create_var(name="X", shape=[3], dtype='float32')
+        var2 = block.create_var(name="Y", shape=[3], dtype='float32')
+        var3 = block.create_var(name="Z", shape=[3], dtype='float32')
+        op1 = block.append_op(
+            type="abs", inputs={"X": [var1]}, outputs={"Out": [var2]})
+        op2 = block.append_op(
+            type="abs", inputs={"X": [var2]}, outputs={"Out": [var3]})
+        amp_lists_1 = amp.AutoMixedPrecisionListsBF16(
+            custom_fp32_varnames={'X'})
+        assert amp.bf16.amp_utils._is_in_fp32_varnames(op1, amp_lists_1)
+        amp_lists_2 = amp.AutoMixedPrecisionListsBF16(
+            custom_fp32_varnames={'Y'})
+        assert amp.bf16.amp_utils._is_in_fp32_varnames(op2, amp_lists_2)
+        assert amp.bf16.amp_utils._is_in_fp32_varnames(op1, amp_lists_2)
+
+    def test_find_true_post_op(self):
+
+        block = fluid.default_main_program().global_block()
+
+        var1 = block.create_var(name="X", shape=[3], dtype='float32')
+        var2 = block.create_var(name="Y", shape=[3], dtype='float32')
+        var3 = block.create_var(name="Z", shape=[3], dtype='float32')
+        op1 = block.append_op(
+            type="abs", inputs={"X": [var1]}, outputs={"Out": [var2]})
+        op2 = block.append_op(
+            type="abs", inputs={"X": [var2]}, outputs={"Out": [var3]})
+        res = amp.bf16.amp_utils.find_true_post_op(block.ops, op1, "Y")
+        assert (res == [op2])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py
new file mode 100644
index 0000000000..40ddcf2e66
--- /dev/null
+++ b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py
@@ -0,0 +1,138 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import unittest
+import numpy as np
+import paddle.fluid.layers as layers
+import paddle.static.amp as amp
+from paddle.fluid import core
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+class TestModelCastBF16(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.seed = 111
+
+    @classmethod
+    def tearDownClass(cls):
+        pass
+
+    @contextlib.contextmanager
+    def static_graph(self):
+        with self.scope_prog_guard():
+            paddle.seed(self.seed)
+            paddle.framework.random._manual_program_seed(self.seed)
+            yield
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+    def get_static_graph_result(self, feed, fetch_list, amp_fun,
+                                with_lod=False):
+        exe = fluid.Executor(core.CPUPlace())
+        exe.run(fluid.default_startup_program())
+        prog = fluid.default_main_program()
+        if amp_fun is not None:
+            amp_fun(prog)
+        return exe.run(prog,
+                       feed=feed,
+                       fetch_list=fetch_list,
+                       return_numpy=(not with_lod))
+
+    def test_graph_rewrite(self):
+        size = 3
+        n = np.ones([size, size], dtype='float32') * 3.2
+        nn = np.ones([size, size], dtype='float32') * -2.7
+
+        n_bf16 = amp.convert_float_to_uint16(n)
+        nn_bf16 = amp.convert_float_to_uint16(nn)
+
+        with self.static_graph():
+            t_bf16 = layers.data(
+                name='t_bf16', shape=[size, size], dtype=np.uint16)
+            tt_bf16 = layers.data(
+                name='tt_bf16', shape=[size, size], dtype=np.uint16)
+            t = layers.data(name='t', shape=[size, size], dtype='float32')
+            tt = layers.data(name='tt', shape=[size, size], dtype='float32')
+
+            ret = layers.elementwise_add(t, tt)
+            ret = layers.elementwise_mul(ret, t)
+            ret = layers.reshape(ret, [0, 0])
+
+            with amp.bf16_guard():
+                ret_bf16 = layers.elementwise_add(t_bf16, tt_bf16)
+                ret_bf16 = layers.elementwise_mul(ret_bf16, t_bf16)
+                ret_bf16 = layers.reshape(ret_bf16, [0, 0])
+
+            with amp.bf16_guard():
+                ret_fp32bf16 = layers.elementwise_add(t, tt)
+                ret_fp32bf16 = layers.elementwise_mul(ret_fp32bf16, t)
+                ret_fp32bf16 = layers.reshape(ret_fp32bf16, [0, 0])
+
+            static_ret_bf16, static_ret, ret_fp32bf16 = self.get_static_graph_result(
+                feed={
+                    't': n,
+                    'tt': nn,
+                    't_bf16': n_bf16,
+                    'tt_bf16': nn_bf16,
+                },
+                fetch_list=[ret_bf16, ret, ret_fp32bf16],
+                amp_fun=lambda prog: amp.rewrite_program_bf16(prog, use_bf16_guard=True))
+
+        self.assertTrue(np.allclose(static_ret_bf16, static_ret, 1e-2))
+        self.assertTrue(np.allclose(static_ret_bf16, ret_fp32bf16, 1e-2))
+
+        with self.static_graph():
+            t = layers.data(name='t', shape=[size, size], dtype='float32')
+            tt = layers.data(name='tt', shape=[size, size], dtype='float32')
+
+            with amp.bf16_guard():
+                ret = layers.elementwise_add(t, tt)
+                ret = layers.reshape(ret, [0, 0], act='elu')
+                ret = layers.elementwise_mul(ret, t)
+            ret = layers.elementwise_add(ret, tt)
+
+            static_ret_bf16 = \
+                self.get_static_graph_result(
+                    feed={'t': n, 'tt': nn},
+                    fetch_list=[ret],
+                    amp_fun=lambda prog: amp.rewrite_program_bf16(
+                        prog,
+                        amp.AutoMixedPrecisionListsBF16(
+                            custom_fp32_varnames={'elementwise_add_0.tmp_0'}),
+                        use_bf16_guard=True
+                    )
+                )
+        self.assertTrue(
+            static_ret_bf16, np.ones(
+                [size, size], dtype='float32') * -1.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index b2db00296b..52be7493cf 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -29,6 +29,7 @@ __all__ = ['DataFeeder']
 _PADDLE_DTYPE_2_NUMPY_DTYPE = {
     core.VarDesc.VarType.BOOL: 'bool',
     core.VarDesc.VarType.FP16: 'float16',
+    core.VarDesc.VarType.BF16: 'uint16',
     core.VarDesc.VarType.FP32: 'float32',
     core.VarDesc.VarType.FP64: 'float64',
     core.VarDesc.VarType.INT8: 'int8',
@@ -47,16 +48,18 @@ def convert_dtype(dtype):
             return _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype]
     elif isinstance(dtype, type):
         if dtype in [
-                np.bool, np.float16, np.float32, np.float64, np.int8, np.int16,
-                np.int32, np.int64, np.uint8, np.complex64, np.complex128
+                np.bool, np.float16, np.uint16, np.float32, np.float64, np.int8,
+                np.int16, np.int32, np.int64, np.uint8, np.complex64,
+                np.complex128
         ]:
             return dtype.__name__
     else:
         if dtype in [
-                'bool', 'float16', 'float32', 'float64', 'int8', 'int16',
-                'int32', 'int64', 'uint8', 'complex64', 'complex128', u'bool',
-                u'float16', u'float32', u'float64', u'int8', u'int16', u'int32',
-                u'int64', u'uint8', u'complex64', u'complex128'
+                'bool', 'float16', 'uint16', 'float32', 'float64', 'int8',
+                'int16', 'int32', 'int64', 'uint8', 'complex64', 'complex128',
+                u'bool', u'float16', u'uint16', u'float32', u'float64', u'int8',
+                u'int16', u'int32', u'int64', u'uint8', u'complex64',
+                u'complex128'
         ]:
             # this code is a little bit dangerous, since error could happen
             # when casting no-ascii code to str in python2.
@@ -66,7 +69,7 @@ def convert_dtype(dtype):
             return str(dtype)
 
     raise TypeError(
-        "dtype must be any of [bool, float16, float32, float64, int8, int16, "
+        "dtype must be any of [bool, float16, uint16, float32, float64, int8, int16, "
         "int32, int64, uint8, complex64, complex128], but received %s" % dtype)
 
 
@@ -123,6 +126,12 @@ def check_dtype(input_dtype,
         warnings.warn(
             "The data type of '%s' in %s only support float16 in GPU now. %s" %
             (input_name, op_name, extra_message))
+    if convert_dtype(input_dtype) in ['uint16'] and op_name not in [
+            'reshape', 'lookup_table', 'scale'
+    ]:
+        warnings.warn(
+            "The data type of '%s' in %s only support bfloat16 in OneDNN now. %s"
+            % (input_name, op_name, extra_message))
     if convert_dtype(input_dtype) not in expected_dtype:
         raise TypeError(
             "The data type of '%s' in %s must be %s, but received %s. %s" %
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index fa8df14c86..00d1db19fc 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6137,9 +6137,9 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
 
         return dygraph_utils._append_activation_in_dygraph(out, act)
 
-    check_variable_and_dtype(
-        x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64',
-                 'bool'], 'reshape')
+    check_variable_and_dtype(x, 'x', [
+        'float16', 'float32', 'float64', 'int32', 'int64', 'bool', 'uint16'
+    ], 'reshape')
     check_type(shape, 'shape', (list, tuple, Variable), 'reshape')
     check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape')
 
@@ -11354,9 +11354,11 @@ def _elementwise_op(helper):
     assert x is not None, 'x cannot be None in {}'.format(op_type)
     assert y is not None, 'y cannot be None in {}'.format(op_type)
     check_variable_and_dtype(
-        x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], op_type)
+        x, 'x', ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'],
+        op_type)
     check_variable_and_dtype(
-        y, 'y', ['float16', 'float32', 'float64', 'int32', 'int64'], op_type)
+        y, 'y', ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'],
+        op_type)
 
     axis = helper.kwargs.get('axis', -1)
     use_mkldnn = helper.kwargs.get('use_mkldnn', False)
@@ -11428,8 +11430,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
         return dygraph_utils._append_activation_in_dygraph(out)
 
     check_variable_and_dtype(x, "x", [
-        'float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64',
-        'uint8'
+        'float16', 'uint16', 'float32', 'float64', 'int8', 'int16', 'int32',
+        'int64', 'uint8'
     ], "scale")
     inputs = {'X': [x]}
     attrs = {
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index 9a2cc4ab1a..df43d9366f 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -26,7 +26,7 @@ import os
 paddle.enable_static()
 
 
-def train(use_cuda, save_dirname, is_local):
+def train(use_cuda, save_dirname, is_local, use_bf16):
     x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 
     y_predict = fluid.layers.fc(input=x, size=1, act=None)
@@ -37,6 +37,8 @@ def train(use_cuda, save_dirname, is_local):
     avg_cost = fluid.layers.mean(cost)
 
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    if use_bf16:
+        paddle.static.amp.rewrite_program_bf16(fluid.default_main_program())
     sgd_optimizer.minimize(avg_cost)
 
     BATCH_SIZE = 20
@@ -133,14 +135,17 @@ def infer(use_cuda, save_dirname=None):
         print("ground truth: ", test_label)
 
 
-def main(use_cuda, is_local=True):
+def main(use_cuda, is_local=True, use_bf16=False):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
 
+    if use_bf16 and not fluid.core.is_compiled_with_mkldnn():
+        return
+
     # Directory for saving the trained model
     save_dirname = "fit_a_line.inference.model"
 
-    train(use_cuda, save_dirname, is_local)
+    train(use_cuda, save_dirname, is_local, use_bf16)
     infer(use_cuda, save_dirname)
 
 
@@ -153,6 +158,12 @@ class TestFitALine(unittest.TestCase):
         with self.program_scope_guard():
             main(use_cuda=True)
 
+    @unittest.skipIf(not fluid.core.supports_bfloat16(),
+                     "place does not support BF16 evaluation")
+    def test_bf16(self):
+        with self.program_scope_guard():
+            main(use_cuda=False, use_bf16=True)
+
     @contextlib.contextmanager
     def program_scope_guard(self):
         prog = fluid.Program()
diff --git a/python/paddle/fluid/tests/book/test_word2vec_book.py b/python/paddle/fluid/tests/book/test_word2vec_book.py
index e33b1cc514..ad7550fa9d 100644
--- a/python/paddle/fluid/tests/book/test_word2vec_book.py
+++ b/python/paddle/fluid/tests/book/test_word2vec_book.py
@@ -39,7 +39,12 @@ def get_place(target):
             format(target))
 
 
-def train(target, is_sparse, is_parallel, save_dirname, is_local=True):
+def train(target,
+          is_sparse,
+          is_parallel,
+          save_dirname,
+          is_local=True,
+          use_bf16=False):
     PASS_NUM = 100
     EMBED_SIZE = 32
     HIDDEN_SIZE = 256
@@ -101,6 +106,8 @@ def train(target, is_sparse, is_parallel, save_dirname, is_local=True):
         raise NotImplementedError()
 
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    if use_bf16:
+        paddle.static.amp.rewrite_program_bf16(fluid.default_main_program())
     sgd_optimizer.minimize(avg_cost)
 
     train_reader = paddle.batch(
@@ -239,12 +246,15 @@ def infer(target, save_dirname=None):
             assert np.isclose(a, b, rtol=5e-5), "a: {}, b: {}".format(a, b)
 
 
-def main(target, is_sparse, is_parallel):
+def main(target, is_sparse, is_parallel, use_bf16):
     if target == "cuda" and not fluid.core.is_compiled_with_cuda():
         return
     if target == "xpu" and not fluid.core.is_compiled_with_xpu():
         return
 
+    if use_bf16 and not fluid.core.is_compiled_with_mkldnn():
+        return
+
     if not is_parallel:
         save_dirname = "word2vec.inference.model"
     else:
@@ -255,7 +265,7 @@ def main(target, is_sparse, is_parallel):
         # so only inference is turned on.
         train("cpu", is_sparse, is_parallel, save_dirname)
     else:
-        train(target, is_sparse, is_parallel, save_dirname)
+        train(target, is_sparse, is_parallel, save_dirname, use_bf16=use_bf16)
     infer(target, save_dirname)
 
 
@@ -268,10 +278,11 @@ class W2VTest(unittest.TestCase):
     pass
 
 
-def inject_test_method(target, is_sparse, is_parallel):
-    fn_name = "test_{0}_{1}_{2}".format(target, "sparse"
-                                        if is_sparse else "dense", "parallel"
-                                        if is_parallel else "normal")
+def inject_test_method(target, is_sparse, is_parallel, use_bf16=False):
+    fn_name = "test_{0}_{1}_{2}{3}".format(target, "sparse"
+                                           if is_sparse else "dense", "parallel"
+                                           if is_parallel else "normal", "_bf16"
+                                           if use_bf16 else "")
 
     def __impl__(*args, **kwargs):
         prog = fluid.Program()
@@ -279,8 +290,7 @@ def inject_test_method(target, is_sparse, is_parallel):
         scope = fluid.core.Scope()
         with fluid.scope_guard(scope):
             with fluid.program_guard(prog, startup_prog):
-                main(
-                    target=target, is_sparse=is_sparse, is_parallel=is_parallel)
+                main(target, is_sparse, is_parallel, use_bf16)
 
     if (not fluid.core.is_compiled_with_cuda() or
             target == "cuda") and is_sparse:
@@ -297,6 +307,7 @@ for target in ("cuda", "cpu", "xpu"):
     for is_sparse in (False, True):
         for is_parallel in (False, ):
             inject_test_method(target, is_sparse, is_parallel)
+inject_test_method("cpu", False, False, use_bf16=True)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 939e2ac0f5..dff96a8cbc 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -244,17 +244,12 @@ def convert_float_to_uint16(float_list, data_format="NCHW"):
     return new_output
 
 
-def copy_bits_from_uint16_to_float(i):
-    i = np.uint32(i) << 16
-    return struct.unpack('<f', struct.pack('<I', i))[0]
-
-
-def convert_uint16_to_float(uint16_list):
-    new_output = []
-    for x in np.nditer(uint16_list):
-        new_output.append(np.float32(copy_bits_from_uint16_to_float(x)))
-
-    return np.reshape(new_output, uint16_list.shape).view(np.float32)
+def convert_uint16_to_float(in_list):
+    in_list = np.asarray(in_list)
+    out = np.vectorize(
+        lambda x: struct.unpack('<f', struct.pack('<I', x << 16))[0],
+        otypes=[np.float32])(in_list.flat)
+    return np.reshape(out, in_list.shape)
 
 
 class OpTest(unittest.TestCase):
diff --git a/python/paddle/static/amp/__init__.py b/python/paddle/static/amp/__init__.py
index 604c7c3d2b..bfc1beed55 100644
--- a/python/paddle/static/amp/__init__.py
+++ b/python/paddle/static/amp/__init__.py
@@ -14,5 +14,8 @@
 
 from ...fluid.contrib import mixed_precision
 from ...fluid.contrib.mixed_precision import *
+from ...fluid.contrib.mixed_precision import bf16
+from ...fluid.contrib.mixed_precision.bf16 import *
 
 __all__ = mixed_precision.__all__
+__all__ += bf16.__all__
diff --git a/python/setup.py.in b/python/setup.py.in
index 71d4afdb28..64cfe6e9cc 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -179,6 +179,7 @@ packages=['paddle',
           'paddle.fluid.contrib.utils',
           'paddle.fluid.contrib.extend_optimizer',
           'paddle.fluid.contrib.mixed_precision',
+          'paddle.fluid.contrib.mixed_precision.bf16',
           'paddle.fluid.contrib.layers',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details',
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index a5239e534e..3fb78b0d0a 100644
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -219,6 +219,7 @@ CPU_PARALLEL_JOB = [
     'test_full_op',
     'test_framework_debug_str',
     'test_fp16_utils',
+    'test_bf16_utils',
     'test_fleet_rolemaker_4',
     'test_flags_use_mkldnn',
     'test_filter_by_instag_op',
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index 2ea3f7654a..6453eb48d7 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -699,4 +699,5 @@ STATIC_MODE_TESTING_LIST = [
     'test_slice_op_xpu',
     'test_generate_proposals_v2_op',
     'test_lamb_op_xpu',
+    'test_model_cast_to_bf16',
 ]
-- 
Gitee


From 6d609267f64bf8123c08c55db575115ce9604d50 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:30:10 +0800
Subject: [PATCH 32/45] !41 [Paddle-TRT] nearest_interp op Merge pull request
 !41 from PaddlePaddle-Gardener/trt_nearest_interp_op

---
 .../fluid/inference/api/analysis_predictor.cc |   2 +
 .../inference/tensorrt/convert/CMakeLists.txt |   2 +
 .../tensorrt/convert/nearest_interp_op.cc     | 114 +++++++++++
 paddle/fluid/inference/tensorrt/op_teller.cc  |  22 ++
 .../inference/test_trt_nearest_interp_op.py   | 192 ++++++++++++++++++
 5 files changed, 332 insertions(+)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index fc436311f0..8f2b217a2f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1192,6 +1192,8 @@ USE_TRT_CONVERTER(scale);
 USE_TRT_CONVERTER(stack);
 USE_TRT_CONVERTER(clip);
 USE_TRT_CONVERTER(gather);
+
+USE_TRT_CONVERTER(nearest_interp);
 #endif
 
 namespace paddle_infer {
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 59205529ef..b0d0229ec0 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -6,6 +6,8 @@ nv_library(tensorrt_converter
                 shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc
                 emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc
                 gather_op.cc
+
+                nearest_interp_op.cc
            DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc
new file mode 100644
index 0000000000..e91a2ee13f
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class NearestInterpolateOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a fluid nearest_interp op";
+
+    framework::OpDesc op_desc(op, nullptr);
+
+    std::string input_name = op_desc.Input("X").front();
+    std::string output_name = op_desc.Output("Out").front();
+
+    auto input = engine_->GetITensor(input_name);
+
+    auto data_layout = framework::StringToDataLayout(
+        BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout")));
+    auto interp_method =
+        BOOST_GET_CONST(std::string, op_desc.GetAttr("interp_method"));
+    bool align_corners =
+        BOOST_GET_CONST(bool, op_desc.GetAttr("align_corners"));
+
+    auto input_names = op_desc.Input("X");
+    auto scale = BOOST_GET_CONST(float, op_desc.GetAttr("scale"));
+    auto out_h = BOOST_GET_CONST(int, op_desc.GetAttr("out_h"));
+    auto out_w = BOOST_GET_CONST(int, op_desc.GetAttr("out_w"));
+
+    auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input);
+    layer->setAlignCorners(align_corners);
+
+    auto in_dim = input->getDimensions();
+
+    float scale_h = 1.f;
+    float scale_w = 1.f;
+
+    std::vector<float> scales;
+
+    if (scale > 0.f && (out_h <= 0 && out_w <= 0)) {
+      scale_h = scale;
+      scale_w = scale;
+    } else {
+      // axis are different in static/dynamic mode
+      PADDLE_ENFORCE_GT(
+          out_h, 0, platform::errors::InvalidArgument(
+                        "out_h must be greater than 0 if scale is not set."));
+      PADDLE_ENFORCE_GT(
+          out_w, 0, platform::errors::InvalidArgument(
+                        "out_w must be greater than 0 if scale is not set."));
+
+      bool with_dynamic = engine_->with_dynamic_shape();
+
+      int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic;
+      int w_axis =
+          (data_layout == framework::DataLayout::kNCHW) + 1 + with_dynamic;
+
+      scale_h =
+          static_cast<float>(out_h) / static_cast<float>(in_dim.d[h_axis]);
+      scale_w =
+          static_cast<float>(out_w) / static_cast<float>(in_dim.d[w_axis]);
+    }
+
+    if (engine_->with_dynamic_shape()) {
+      scales.push_back(1.f);
+    }
+
+    if (data_layout == framework::DataLayout::kNCHW) {
+      scales.push_back(1.f);
+      scales.push_back(scale_h);
+      scales.push_back(scale_w);
+    } else if (data_layout == framework::DataLayout::kNHWC) {
+      // NHWC
+      scales.push_back(scale_h);
+      scales.push_back(scale_w);
+      scales.push_back(1.f);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Data layout must be NCHW or NHWC."));
+    }
+    layer->setScales(scales.data(), scales.size());
+
+    RreplenishLayerAndOutput(layer, "nearest_interp", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(nearest_interp, NearestInterpolateOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 44939606b4..2ec94f5f98 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/data_layout.h"
 
 namespace paddle {
 namespace framework {
@@ -110,6 +111,8 @@ struct SimpleOpTypeSetTeller : public Teller {
       "flatten2",
       "flatten",
       "gather",
+
+      "nearest_interp",
   };
 };
 
@@ -187,10 +190,29 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
         if (axis != 1) return false;
       }
     }
+
     if (op_type == "gather") {
       // current not support axis from input, use default 0
       if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false;
     }
+
+    if (op_type == "nearest_interp") {
+      std::vector<std::string> attrs{"data_layout",   "interp_method",
+                                     "align_corners", "scale",
+                                     "out_h",         "out_w"};
+      for (auto const attr : attrs) {
+        if (!desc.HasAttr(attr)) return false;
+      }
+      auto data_layout = framework::StringToDataLayout(
+          BOOST_GET_CONST(std::string, desc.GetAttr("data_layout")));
+      if (data_layout != framework::DataLayout::kNCHW &&
+          data_layout != framework::DataLayout::kNHWC)
+        return false;
+      auto interp_method =
+          BOOST_GET_CONST(std::string, desc.GetAttr("interp_method"));
+      if (interp_method != "nearest") return false;
+    }
+
     if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
   }
   return false;
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py
new file mode 100644
index 0000000000..1a58a6c9dd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTNearestInterpTest(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            if self.data_layout == 'NCHW':
+                shape = [
+                    -1, self.channels, self.origin_shape[0],
+                    self.origin_shape[1]
+                ]
+            else:
+                shape = [
+                    -1, self.origin_shape[0], self.origin_shape[1],
+                    self.channels
+                ]
+            data = fluid.data(name='data', shape=shape, dtype='float32')
+            resize_out = self.append_nearest_interp(data)
+            out = fluid.layers.batch_norm(resize_out, is_test=True)
+
+        if self.data_layout == 'NCHW':
+            shape = [
+                self.bs, self.channels, self.origin_shape[0],
+                self.origin_shape[1]
+            ]
+        else:
+            shape = [
+                self.bs, self.origin_shape[0], self.origin_shape[1],
+                self.channels
+            ]
+
+        self.feeds = {'data': np.random.random(shape).astype('float32'), }
+        self.enable_trt = True
+        self.trt_parameters = TRTNearestInterpTest.TensorRTParam(
+            1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def set_params(self):
+        self.bs = 4
+        self.scale = 1
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (64, 64)  # HW
+        self.align_corners = True
+        self.data_layout = 'NCHW'
+
+    def append_nearest_interp(self, data):
+        if self.scale > 0.:
+            return fluid.layers.resize_nearest(
+                data,
+                scale=self.scale,
+                align_corners=self.align_corners,
+                data_format=self.data_layout)
+        return fluid.layers.resize_nearest(
+            data,
+            out_shape=self.resize_shape,
+            align_corners=self.align_corners,
+            data_format=self.data_layout)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+class TRTNearestInterpTest1(TRTNearestInterpTest):
+    def set_params(self):
+        self.bs = 4
+        self.scale = -1
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (64, 64)  # HW
+        self.align_corners = True
+        self.data_layout = 'NCHW'
+
+
+class TRTNearestInterpTest2(TRTNearestInterpTest):
+    def set_params(self):
+        self.bs = 4
+        self.scale = 2.
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (64, 64)  # HW
+        self.align_corners = False
+        self.data_layout = 'NCHW'
+
+
+class TRTNearestInterpTest3(TRTNearestInterpTest):
+    def set_params(self):
+        self.bs = 4
+        self.scale = -1
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (64, 64)  # HW
+        self.align_corners = False
+        self.data_layout = 'NCHW'
+
+
+class TRTNearestInterpTest4(TRTNearestInterpTest):
+    def set_params(self):
+        self.bs = 4
+        self.scale = -1
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (47, 48)  # HW
+        self.align_corners = False
+        self.data_layout = 'NCHW'
+
+
+class TRTNearestInterpTest5(TRTNearestInterpTest):
+    def set_params(self):
+        self.bs = 4
+        self.scale = -1
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (64, 64)  # HW
+        self.align_corners = True
+        self.data_layout = 'NHWC'
+
+
+class TRTNearestInterpTest6(TRTNearestInterpTest):
+    def set_params(self):
+        self.bs = 4
+        self.scale = 2.
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (64, 64)  # HW
+        self.align_corners = False
+        self.data_layout = 'NHWC'
+
+
+class TRTNearestInterpTest7(TRTNearestInterpTest):
+    def set_params(self):
+        self.bs = 4
+        self.scale = -1
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (64, 64)  # HW
+        self.align_corners = False
+        self.data_layout = 'NHWC'
+
+
+class TRTNearestInterpTest8(TRTNearestInterpTest):
+    def set_params(self):
+        self.bs = 4
+        self.scale = -1
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (47, 48)  # HW
+        self.align_corners = False
+        self.data_layout = 'NHWC'
+
+
+class TRTNearestInterpTest9(TRTNearestInterpTest):
+    def set_params(self):
+        self.bs = 4
+        self.scale = -1
+        self.channels = 3
+        self.origin_shape = (32, 32)  # HW
+        self.resize_shape = (47, 48)  # HW
+        self.align_corners = False
+        self.data_layout = 'NHWC'
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
Gitee


From 7f04c755824c2e46af3f1ba6c6f4a202c5342a7e Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:30:11 +0800
Subject: [PATCH 33/45] !42 update approval Merge pull request !42 from
 PaddlePaddle-Gardener/approve

---
 tools/check_api_approvals.sh       |  4 +--
 tools/check_file_diff_approvals.sh | 44 +++++++++++++++---------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 1db3f6d3d2..4e8ea25715 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -61,8 +61,8 @@ DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_gr
 PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_PR.spec
 ADDED_OP_USE_DEFAULT_GRAD_MAKER=`python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py ${DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC} ${PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC}` 
 if [ "${ADDED_OP_USE_DEFAULT_GRAD_MAKER}" != "" ]; then
-  echo_line="You must have one RD (sneaxiy (Recommend) or luotao1) approval because you use DefaultGradOpMaker for ${ADDED_OP_USE_DEFAULT_GRAD_MAKER}, which manages the grad_op memory optimization.\n" 
-  check_approval 1 32832641 6836917
+  echo_line="You must have one RD (zhiqiu (Recommend) or zhhsplendid) approval because you use DefaultGradOpMaker for ${ADDED_OP_USE_DEFAULT_GRAD_MAKER}, which manages the grad_op memory optimization.\n" 
+  check_approval 1 6888866 7913861
 fi
 
 if [ -n "${echo_list}" ];then
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index fd3175a572..f3bf3ea508 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -92,11 +92,11 @@ for API_FILE in ${API_FILES[*]}; do
       # You can use http://caius.github.io/github_id/ to find Github user id.
       # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676, Dong Daxiang 35550832, phlrain 43953930.
       if [ "${API_FILE}" == "CMakeLists.txt" ];then
-          echo_line="You must have one RD (luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n"
-          check_approval 1 6836917 46782768
+          echo_line="You must have one RD (wanghuancoder, luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n"
+          check_approval 1 6836917 46782768 26922892
       elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
-          echo_line="You must have one RD (lanxianghit (Recommend) or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n"
-          check_approval 1 6836917 47554610
+          echo_line="You must have one RD (lanxianghit (Recommend), phlrain or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n"
+          check_approval 1 6836917 47554610 43953930
       elif [ "${API_FILE}" == "python/requirements.txt" ];then
           echo_line="You must have one RD (phlrain) and one TPM (swtkiwi) and one QA (kolinwei) approval for python/requirements.txt, which manages the third-party python package.\n"
           check_approval 3 43953930 27208573 22165420
@@ -104,8 +104,8 @@ for API_FILE in ${API_FILES[*]}; do
           echo_line="You must have one RD (gongweibao or seiriosPlus) approval for the paddle/fluid/operators/distributed/send_recv.proto.in, which manages the environment variables.\n"
           check_approval 1 10721757 5442383
       elif [ "${API_FILE}" == "paddle/fluid/framework/unused_var_check.cc" ];then
-          echo_line="You must have one RD (zhiqiu (Recommend) or luotao1) approval for the changes of paddle/fluid/framework/unused_var_check.cc, which manages the allow list of operators that have unused input variables. Before change the allow list, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/OP-Should-Not-Have-Unused-Input] and try to refine code first. \n"
-          check_approval 1 6888866 6836917
+          echo_line="You must have one RD (zhiqiu (Recommend) or chenwhql) approval for the changes of paddle/fluid/framework/unused_var_check.cc, which manages the allow list of operators that have unused input variables. Before change the allow list, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/OP-Should-Not-Have-Unused-Input] and try to refine code first. \n"
+          check_approval 1 6888866 22561442
       elif [ "${API_FILE}" == "paddle/fluid/pybind/op_function_generator.cc" ];then
           echo_line="You must have one RD (zhiqiu (Recommend) , phlrain) approval for the changes of paddle/fluid/pybind/op_function_generator.cc, which manages the logic of automatic generating op functions for dygraph. \n"
           check_approval 1 6888866 43953930
@@ -122,14 +122,14 @@ for API_FILE in ${API_FILES[*]}; do
           echo_line="You must have one RD (cryoco (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py, which manages the white list of setting no_check_set of check_output. \n"
           check_approval 1 12407750 6836917 43953930
       elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py" ]; then
-          echo_line="You must have one RD (luotao1, phlrain) approval for the ${API_FILE}, which manages the white list of instance size 0 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-instance_size=0-in-sequence-OP-test]. \n"
-          check_approval 1 6836917 43953930
+          echo_line="You must have one RD (luotao1, lanxianghit, phlrain) approval for the ${API_FILE}, which manages the white list of instance size 0 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-instance_size=0-in-sequence-OP-test]. \n"
+          check_approval 1 6836917 43953930 47554610
       elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py" ];then
           echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py, which manages the white list of error threshold for op test with float64 precision. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n"
           check_approval 1 52520497 26615455 6836917
       elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" ];then
-          echo_line="You must have one RD (luotao1 or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n"
-          check_approval 1 6836917 43953930
+          echo_line="You must have one RD (luotao1, lanxianghit or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n"
+          check_approval 1 6836917 43953930 47554610
       elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then
           echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n"
           check_approval 1 39303645 6836917 43953930
@@ -143,17 +143,17 @@ for API_FILE in ${API_FILES[*]}; do
 	      echo_line="You must have (fuyinno4 (Recommend), raindrops2sea) approval for ${API_FILE} changes"
 	      check_approval 1 35824027 38231817
       elif [ "${API_FILE}" == "paddle/scripts/paddle_build.bat" ] || [ "${API_FILE}" == "tools/windows/run_unittests.sh" ]; then
-	      echo_line="You must have one RD (zhouwei25 (Recommend), luotao1) approval for ${API_FILE} changes, which manages the Paddle CI task on Windows.\n"
-	      check_approval 1 52485244 6836917
+	      echo_line="You must have one RD (zhouwei25 (Recommend), wanghuancoder, luotao1) approval for ${API_FILE} changes, which manages the Paddle CI task on Windows.\n"
+	      check_approval 1 52485244 6836917 26922892
       elif [ "${API_FILE}" == "tools/parallel_UT_rule.py" ]; then
-	      echo_line="You must have one RD (zhouwei25 (Recommend), luotao1) approval for ${API_FILE} changes, which manages the rule of running unittest with a same GPU. If the unittest failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, you can remove it from ${API_FILE}.\n"
-	      check_approval 1 52485244 6836917
+	      echo_line="You must have one RD (zhouwei25 (Recommend), wanghuancoder, luotao1) approval for ${API_FILE} changes, which manages the rule of running unittest with a same GPU. If the unittest failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, you can remove it from ${API_FILE}.\n"
+	      check_approval 1 52485244 6836917 26922892
       elif [ "${API_FILE}" == "python/paddle/fluid/parallel_executor.py" ]; then
           echo_line="You must have one RD (Xreki,luotao1,zhhsplendid) approval for ${API_FILE}, which manages the underlying code for PaddlePaddle.\n"
           check_approval 1 12538138 6836917 7913861
       else
-          echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
-          check_approval 1 46782768 12538138 6836917
+          echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
+          check_approval 1 46782768 12538138 6836917 22561442 6888866
       fi
   fi
 done
@@ -161,8 +161,8 @@ done
 FILTER=`git diff --name-only upstream/develop | grep -v "tools/"`
 HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH $FILTER | grep '^\+' | grep -o -m 1 "const_cast" || true`
 if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for the usage of const_cast.\n"
-    check_approval 1 46782768 12538138 6836917
+    echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1) approval for the usage of const_cast.\n"
+    check_approval 1 46782768 12538138 6836917 22561442 6888866
 fi
 
 HAS_BOOST_GET=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "boost::get" || true`
@@ -185,14 +185,14 @@ fi
 
 HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH | grep "^+[[:space:]]\{0,\}@unittest.skip" || true`
 if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n"
-    check_approval 1 22165420 6836917 46661762
+    echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n"
+    check_approval 1 22165420 6836917 46661762 26922892
   fi
 
 HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true`
 if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You must have one RD (Superjomn (Recommend), luotao1) approval for paddle/fluid/inference/api/demo_ci/CMakeLists.txt.\nwhich manages the compilation parameter of inference demo\n"
-    check_approval 1 328693 6836917
+    echo_line="You must have one RD (Superjomn (Recommend), Shixiaowei02, luotao1) approval for paddle/fluid/inference/api/demo_ci/CMakeLists.txt.\nwhich manages the compilation parameter of inference demo\n"
+    check_approval 1 328693 6836917 39303645
   fi
 
 ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
-- 
Gitee


From 5884d5b94ab120e156e796e0a948fd8dd0da9284 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:30:11 +0800
Subject: [PATCH 34/45] !43 Fix `test_fleet_launch_ps`. Merge pull request !43
 from PaddlePaddle-Gardener/fixlaunchps

---
 .../tests/unittests/test_fleet_launch_ps.sh   | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh
index 67a8d7e575..0f28be614c 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh
@@ -16,18 +16,24 @@
 
 set -e
 
-server_port_0=${PADDLE_DIST_UT_PORT}
-server_port_1=$(( PADDLE_DIST_UT_PORT + 1 ))
-worker_port_0=$(( PADDLE_DIST_UT_PORT + 2 ))
-worker_port_1=$(( PADDLE_DIST_UT_PORT + 3 ))
-heter_worker_port_0=$(( PADDLE_DIST_UT_PORT + 4 ))
-heter_worker_port_1=$(( PADDLE_DIST_UT_PORT + 5 ))
+server_port_00=${PADDLE_DIST_UT_PORT}
+server_port_10=$(( PADDLE_DIST_UT_PORT + 1 ))
+worker_port_00=$(( PADDLE_DIST_UT_PORT + 2 ))
+worker_port_10=$(( PADDLE_DIST_UT_PORT + 3 ))
+
+server_port_01=$(( PADDLE_DIST_UT_PORT + 4 ))
+server_port_11=$(( PADDLE_DIST_UT_PORT + 5 ))
+worker_port_01=$(( PADDLE_DIST_UT_PORT + 6 ))
+worker_port_11=$(( PADDLE_DIST_UT_PORT + 7 ))
+
+heter_worker_port_0=$(( PADDLE_DIST_UT_PORT + 8 ))
+heter_worker_port_1=$(( PADDLE_DIST_UT_PORT + 9 ))
 
 function test_launch_ps(){
 
     python -m paddle.distributed.fleet.launch \
-        --servers="127.0.0.1:${server_port_0},127.0.0.1:${server_port_1}" \
-        --workers="127.0.0.1:${worker_port_0},127.0.0.1:${worker_port_1}" \
+        --servers="127.0.0.1:${server_port_00},127.0.0.1:${server_port_10}" \
+        --workers="127.0.0.1:${worker_port_00},127.0.0.1:${worker_port_10}" \
         fleet_ps_training.py 2> ut.elog
     if grep -q "server are killed" ut.elog; then
         echo "test pserver launch succeed"
@@ -39,8 +45,8 @@ function test_launch_ps(){
 
 function test_launch_ps_heter(){
     python -m paddle.distributed.fleet.launch \
-        --servers="127.0.0.1:${server_port_0},127.0.0.1:${server_port_1}" \
-        --workers="127.0.0.1:${worker_port_0},127.0.0.1:${worker_port_1}" \
+        --servers="127.0.0.1:${server_port_01},127.0.0.1:${server_port_11}" \
+        --workers="127.0.0.1:${worker_port_01},127.0.0.1:${worker_port_11}" \
         --heter_workers="127.0.0.1:${heter_worker_port_0},127.0.0.1:${heter_worker_port_1}" \
         fleet_ps_training.py 2> ut.elog
     if grep -q "server are killed" ut.elog; then
-- 
Gitee


From 9c61bd40fd88a7127d4798ab871ead24039e6a33 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:30:12 +0800
Subject: [PATCH 35/45] !44 [ROCM] fix reduce_sum nan in ROCM platform,
 test=develop Merge pull request !44 from
 PaddlePaddle-Gardener/rocm_fix_reduce

---
 paddle/fluid/operators/reduce_ops/cub_reduce.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/fluid/operators/reduce_ops/cub_reduce.h b/paddle/fluid/operators/reduce_ops/cub_reduce.h
index 39cce60faf..29e46e091d 100644
--- a/paddle/fluid/operators/reduce_ops/cub_reduce.h
+++ b/paddle/fluid/operators/reduce_ops/cub_reduce.h
@@ -161,7 +161,11 @@ static inline std::vector<int> GetStrides(const std::vector<int>& dims,
   return strides;
 }
 
+#ifdef __HIPCC__
+constexpr int kMaxBlockDim = 256;
+#else
 constexpr int kMaxBlockDim = 512;
+#endif
 
 static inline int GetDesiredBlockDim(int block_dim) {
   return block_dim >= kMaxBlockDim
-- 
Gitee


From 04082dbaa43af0b9b4bd86e5e4b19dc08932fd4d Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:34:12 +0800
Subject: [PATCH 36/45] !45 fix tensorrt output varible reshape Merge pull
 request !45 from PaddlePaddle-Gardener/fix_fc_1

---
 .../ir_passes/tensorrt_subgraph_pass.cc       |  8 ++-
 .../fluid/inference/tensorrt/convert/fc_op.cc | 70 +++++++++++++++++--
 .../tensorrt/convert/multihead_matmul_op.cc   | 43 +++++++++---
 .../inference/tensorrt/convert/softmax_op.cc  | 13 ++--
 paddle/fluid/inference/tensorrt/op_teller.cc  | 12 +++-
 .../plugin/emb_eltwise_layernorm_plugin.cu    |  4 +-
 .../tensorrt/plugin/qkv_to_context_plugin.cu  |  4 +-
 .../plugin/skip_layernorm_op_plugin.cu        |  5 --
 .../tensorrt/plugin/special_slice_plugin.cu   |  2 +
 9 files changed, 125 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 59ed09b96c..60de4234b4 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -168,11 +168,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
 
   std::set<std::string> output_names;
   std::set<std::string> output_names_with_id;
-  std::vector<int> origin_output_dims;
+  std::map<std::string, int> origin_name_output_dims;
   for (auto *x : node->outputs) {
     output_names.insert(x->Name());
     output_names_with_id.insert(x->Name() + std::to_string(x->id()));
-    origin_output_dims.push_back(x->Var()->GetShape().size());
+    origin_name_output_dims[x->Name()] = x->Var()->GetShape().size();
   }
 
   std::unordered_map<std::string, std::string> output_name_map;
@@ -216,11 +216,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // output_mapping help us copy the data from the renamed ITensor
   // to Tensor.
   std::vector<std::string> output_mapping;
+  std::vector<int> renamed_output_dims;
   for (auto name : output_names) {
     PADDLE_ENFORCE_NE(output_name_map.count(name), 0,
                       platform::errors::PreconditionNotMet(
                           "The output_name_map should have %s", name));
     output_mapping.push_back(output_name_map[name]);
+    renamed_output_dims.push_back(origin_name_output_dims[name]);
   }
   PADDLE_ENFORCE_EQ(output_mapping.empty(), false,
                     platform::errors::PreconditionNotMet(
@@ -243,7 +245,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
   op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
   op_desc->SetAttr("output_name_mapping", output_mapping);
-  op_desc->SetAttr("origin_output_dims", origin_output_dims);
+  op_desc->SetAttr("origin_output_dims", renamed_output_dims);
   op_desc->SetAttr("parameters", params);
 
   // we record all inputs' shapes in attr to check if they are consistent
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 41fbbb557d..527d0ee208 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -144,7 +144,69 @@ class FcOpConverter : public OpConverter {
                                 static_cast<size_t>(bias_num)};
 
     if (engine_->with_dynamic_shape()) {
-      regist_fc(X, n_output, weight, bias);
+      // not NCHW layout, but NLP layout with added 'x 1 x 1'
+      auto x_dim = X->getDimensions();
+      if (x_dim.nbDims == 3 || x_dim.nbDims == 2) {
+        auto output_name = op_desc.Output("Out").front();
+        // add shuffle before fc
+        nvinfer1::Dims reshape_before_fc_dim;
+        reshape_before_fc_dim.nbDims = x_dim.nbDims + 2;
+        for (int i = 0; i < x_dim.nbDims; i++) {
+          reshape_before_fc_dim.d[i] = 0;
+        }
+        reshape_before_fc_dim.d[x_dim.nbDims] = 1;
+        reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_fc(Output: " + output_name + ")").c_str());
+
+        // add fc layer
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0),
+            n_output, weight.get(), bias.get());
+        fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str());
+
+        // add shuffle after fc
+        nvinfer1::Dims reshape_after_fc_dim;
+        if (x_dim.nbDims == 3) {
+          if (x_num_col_dims == 2) {
+            reshape_after_fc_dim.nbDims = 3;
+            reshape_after_fc_dim.d[0] = 0;
+            reshape_after_fc_dim.d[1] = 0;
+            reshape_after_fc_dim.d[2] = 0;
+          } else {
+            reshape_after_fc_dim.nbDims = 2;
+            reshape_after_fc_dim.d[0] = 0;
+            auto dim = fc_layer->getOutput(0)->getDimensions();
+            reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2];
+          }
+          // x_dim.nbDims == 2
+        } else {
+          reshape_after_fc_dim.nbDims = 2;
+          reshape_after_fc_dim.d[0] = 0;
+          reshape_after_fc_dim.d[1] = 0;
+        }
+        auto* reshape_after_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
+        reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim);
+
+        if (activation_type == "relu") {
+          reshape_after_fc_layer->setName(
+              ("shuffle_after_fc(Output: " + output_name + ")").c_str());
+          nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, Activation, *(reshape_after_fc_layer->getOutput(0)),
+              nvinfer1::ActivationType::kRELU);
+          RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle",
+                                   {output_name}, test_mode);
+        } else {
+          RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc",
+                                   {output_name}, test_mode);
+        }
+      } else {
+        regist_fc(X, n_output, weight, bias);
+      }
       return;
     }
     // in order to handle situations in NLP models(input dims < 3,
@@ -154,12 +216,6 @@ class FcOpConverter : public OpConverter {
     auto input_d = X->getDimensions().d;
     int reshape_dim3[3] = {0};
     int reshape_dim4[4] = {0};
-    PADDLE_ENFORCE_EQ(
-        x_num_col_dims == 1 || x_num_col_dims == 2, true,
-        platform::errors::InvalidArgument(
-            "Wrong x_num_col_dims param of op mul. Paddle-TRT FC converter "
-            "expects x_num_col_dims is either 1 or 2, but got %d",
-            x_num_col_dims));
     PADDLE_ENFORCE_LE(x_num_col_dims, input_dims,
                       platform::errors::InvalidArgument(
                           "Params and input dims mismatch. Paddle-TRT FC "
diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index ee04fd372c..8ce46a19d4 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -8,8 +8,8 @@ http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See
+the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
@@ -28,7 +28,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
                "network structure";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
-    // Shouble be a 5 dims tensor.
     auto* input = engine_->GetITensor(op_desc.Input("Input").front());
 
     // fc weights and fc bias
@@ -69,6 +68,7 @@ class MultiheadMatMulOpConverter : public OpConverter {
     int head_number = BOOST_GET_CONST(int, op_desc.GetAttr("head_number"));
 
     nvinfer1::ILayer* layer = nullptr;
+    auto output_name = op_desc.Output("Out")[0];
 
     if (engine_->with_dynamic_shape()) {
       if (engine_->use_oss()) {
@@ -171,6 +171,12 @@ class MultiheadMatMulOpConverter : public OpConverter {
             plugin_inputs.data(), plugin_inputs.size(), *plugin);
         layer = plugin_layer;
       } else {
+        PADDLE_ENFORCE_EQ(
+            input->getDimensions().nbDims, 3,
+            platform::errors::InvalidArgument(
+                "The Input dim of the MultiheadMatMul should be 3, "
+                "but it's (%d) now.",
+                input->getDimensions().nbDims));
         // transpose weight_data from m * n to  n * m
         auto* input_bias_qk =
             engine_->GetITensor(op_desc.Input("BiasQK").front());
@@ -184,15 +190,37 @@ class MultiheadMatMulOpConverter : public OpConverter {
                                     static_cast<void*>(bias_data),
                                     static_cast<size_t>(bias_t->numel())};
 
-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
-                                              n, weight.get(), bias.get());
-        auto* fc_out = fc_layer->getOutput(0);
+        // add shuffle before fc
+        nvinfer1::Dims reshape_before_fc_dim;
+        reshape_before_fc_dim.nbDims = 5;
+        reshape_before_fc_dim.d[0] = 0;
+        reshape_before_fc_dim.d[1] = 0;
+        reshape_before_fc_dim.d[2] = 0;
+        reshape_before_fc_dim.d[3] = 1;
+        reshape_before_fc_dim.d[4] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
+                .c_str());
+
+        // add layer fc
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n,
+            weight.get(), bias.get());
+        fc_layer->setName(
+            ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
+
+        // no need to add shuffle after fc, just change it in
+        // QkvToContextPluginDynamic
+
         // add qkv to context
         int head_size = hidden_out / head_number;
         float scale = BOOST_GET_CONST(float, op_desc.GetAttr("alpha"));
 
         std::vector<nvinfer1::ITensor*> plugin_inputs;
-        plugin_inputs.push_back(fc_out);
+        plugin_inputs.push_back(fc_layer->getOutput(0));
         plugin_inputs.push_back(input_bias_qk);
         bool with_fp16 =
             engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
@@ -208,7 +236,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
           "You can use the config.SetTRTDynamicShapeInfo(...) interface to set "
           "the shape information to run the dynamic shape mode."));
     }
-    auto output_name = op_desc.Output("Out")[0];
     RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name},
                              test_mode);
 #else
diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
index 79992065a2..9cefb24751 100644
--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
@@ -51,6 +51,7 @@ class SoftMaxOpConverter : public OpConverter {
     uint32_t axes = std::max(0, input_dims - 3);
     // TODO(cryoco): Poor workaround. Fix padded dims problem when TRT layers
     // support Nd.
+    // Tips: Dynammic shape alreay fixes.
     int padded_dims = 0;
     int explicit_batch = 0;
     if (engine_->with_dynamic_shape()) explicit_batch = 1;
@@ -62,16 +63,16 @@ class SoftMaxOpConverter : public OpConverter {
       }
     }
     if (!engine_->with_dynamic_shape()) {
-      if (axis == -1) {
-        axes = input_dims - 1 - padded_dims;
+      if (axis < 0) {
+        axes = input_dims + axis - padded_dims;
       } else {
-        axes = axis;
+        axes = axis - 1;
       }
     } else {
-      if (axis == -1) {
-        axes = input_dims - 1 - padded_dims;
+      if (axis < 0) {
+        axes = input_dims + axis;
       } else {
-        axes = axis + 1;
+        axes = axis;
       }
     }
     layer->setAxes(1 << axes);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 2ec94f5f98..11752d71a4 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -195,7 +195,17 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
       // current not support axis from input, use default 0
       if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false;
     }
-
+    if (op_type == "fc" || op_type == "mul") {
+      const int x_num_col_dims =
+          desc.HasAttr("x_num_col_dims")
+              ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims"))
+              : (desc.HasAttr("in_num_col_dims")
+                     ? BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims"))
+                     : 1);
+      if (x_num_col_dims != 1 && x_num_col_dims != 2) {
+        return false;
+      }
+    }
     if (op_type == "nearest_interp") {
       std::vector<std::string> attrs{"data_layout",   "interp_method",
                                      "align_corners", "scale",
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 238daa4a88..6d3872aaeb 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -200,12 +200,10 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
                         "but it's (%d)",
                         output_index));
   nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[0].d[1];
   ret.d[2] = expr_builder.constant(hidden_size_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
   return ret;
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index 1e7c83f4c6..a5fc9e73c5 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -169,12 +169,10 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions(
           "it has (%d) inputs",
           nb_inputs));
   nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[0].d[1];
   ret.d[2] = expr_builder.constant(head_size_ * head_number_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
   return ret;
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
index 3b9eea2219..7be9e3a740 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
@@ -54,11 +54,6 @@ void SkipLayerNormPluginDynamic::terminate() {
 nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
     nvinfer1::IExprBuilder &expr_builder) {
-  PADDLE_ENFORCE_EQ(
-      inputs[0].nbDims, 5,
-      platform::errors::InvalidArgument(
-          "The Input dim of the SkipLayernorm should be 5, but it's (%d) now.",
-          inputs[0].nbDims));
   return inputs[0];
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
index 250b944652..fdb14f9cea 100644
--- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
@@ -62,6 +62,8 @@ nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions(
   output.d[1] = one;
   output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB,
                                        *inputs[1].d[0], *one);
+  // remove padding 1
+  output.nbDims -= 2;
 
   return output;
 }
-- 
Gitee


From 08a73ca98201e37f9324a3733afe7b5d2a5e0d33 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:34:12 +0800
Subject: [PATCH 37/45] !46 Delete fast_check_nan_inf Merge pull request !46
 from PaddlePaddle-Gardener/del_fast_check_nan_inf

---
 paddle/fluid/framework/operator.cc | 22 --------
 python/paddle/fluid/__init__.py    |  1 -
 python/paddle/fluid/debugger.py    | 85 ------------------------------
 3 files changed, 108 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 833a28a757..834cdb422a 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -47,9 +47,6 @@ DECLARE_bool(benchmark);
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
 DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
-DEFINE_bool(fast_check_nan_inf, false,
-            "Fast checking NAN/INF after each operation. It will be a little"
-            "bit slow, much faster than check_nan_inf");
 
 namespace paddle {
 namespace framework {
@@ -1173,25 +1170,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 #endif
   }
 
-  if (FLAGS_fast_check_nan_inf) {
-    for (auto& vname : OutputVars(true)) {
-      // only check inserted vars,
-      // please see executor.py for details of fast_check_nan_inf
-      if (vname.rfind("debug_var") == 0) {
-        VLOG(3) << "debugging nan/inf in var " << vname;
-
-        auto* var = exec_scope.FindVar(vname);
-        if (var == nullptr) continue;
-        if (var->IsType<framework::LoDTensor>()) {
-          CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
-        } else if (var->IsType<framework::SelectedRows>()) {
-          CheckTensorNANOrInf(type_, vname,
-                              var->Get<framework::SelectedRows>().value());
-        }
-      }
-    }
-  }
-
   if (FLAGS_check_nan_inf) {
     framework::details::CheckOpHasNanOrInf(*this, exec_scope, place);
   }
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 1a88d3512e..b24da29d0f 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -175,7 +175,6 @@ def __bootstrap__():
     sysstr = platform.system()
     read_env_flags = [
         'check_nan_inf',
-        'fast_check_nan_inf',
         'benchmark',
         'eager_delete_scope',
         'fraction_of_cpu_memory_to_use',
diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py
index 9110b8daf3..75dc14a1d7 100644
--- a/python/paddle/fluid/debugger.py
+++ b/python/paddle/fluid/debugger.py
@@ -280,88 +280,3 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
             add_op_link_var(opn, var, True)
 
     graph(path, show=False)
-
-
-def prepare_fast_nan_inf_debug(_program):
-    """
-    Given a program to run, insert a (reduce) sum op for every var in that program.
-    Instead of checking all vars originally defined in the program,
-    only those inserted ops will be checked in the c++ end, to detect if it contains NAN or INF.
-    Thereforce, the speed of nan/inf checking could be improved.
-    Please set ``FLAGS_fast_check_nan_inf" to open the fast nan/inf feature.
-    """
-
-    helper = LayerHelper('reduce_sum', **locals())
-
-    if _program is None:
-        _program = default_main_program()
-
-    for _block in _program.blocks:
-        # fetch vars in the current block
-        _vars_in_prog = []
-        for _var_name in _block.vars:
-            _vars_in_prog.append((_var_name, _block.vars[_var_name]))
-
-        # append sum_op in the current block
-        for _var_name, _var in _vars_in_prog:
-
-            try:
-
-                if _var.dtype == -1:
-                    continue
-
-                ## create a var for holding sum output
-                _output_var = _block.create_var(
-                    name=unique_name.generate("debug_var_" + _var_name),
-                    dtype=_var.dtype,
-                    type=core.VarDesc.VarType.LOD_TENSOR,
-                    persistable=False,
-                    stop_gradient=True)
-
-                ## create a sum op, input each existing var in the block
-                _block.append_op(
-                    type='sum',
-                    outputs={'Out': _output_var},
-                    inputs={'X': [_var]})
-            except Exception as e:
-                pass
-
-
-def run_fast_nan_inf_debug(executor,
-                           program=None,
-                           feed=None,
-                           fetch_list=None,
-                           feed_var_name='feed',
-                           fetch_var_name='fetch',
-                           scope=None,
-                           return_numpy=True,
-                           use_program_cache=False,
-                           dump_core=True):
-    """
-    Run a program by the given executor. Catch the exception of NAN and INF, and save persistables into the dumped core.
-    """
-
-    assert (executor is not None)
-
-    try:
-        output = executor.run(program=program,
-                              feed=feed,
-                              fetch_list=fetch_list,
-                              feed_var_name=feed_var_name,
-                              fetch_var_name=fetch_var_name,
-                              scope=scope,
-                              return_numpy=return_numpy,
-                              use_program_cache=use_program_cache)
-
-        return output
-
-    except Exception as e:
-
-        print("catch an exception:")
-        print(e)
-
-        core_filename = "core" + str(int(random.random() * 10000)) + ".pdckpt"
-        io.save_persistables(
-            executor, "./", main_program=program, filename=core_filename)
-
-        print("dumping a core into ./%s" % core_filename)
-- 
Gitee


From beed999f8532a529a29b74d82e2a3ab386aa12ff Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:34:13 +0800
Subject: [PATCH 38/45] !47 update scale collection and propagation algorithm
 Merge pull request !47 from PaddlePaddle-Gardener/wojtuss/fix-for-31103

---
 .../quantization/quant2_int8_mkldnn_pass.py   | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
index d93a2059bd..68cc8106c9 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
@@ -62,9 +62,8 @@ class Quant2Int8MkldnnPass(object):
         self._ops_to_quantize = _ops_to_quantize
         self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip is not None else set(
             [-1])
-        self._scale_immutable_ops = [
-            'transpose2', 'reshape2', 'pool2d', 'scale'
-        ]
+        self._scale_immutable_ops = ['transpose2', 'reshape2', 'pool2d']
+        self._scale_ops = ['scale']
         self._conv_ops = ['conv2d', 'depthwise_conv2d']
         self._pool_ops = ['pool2d']
         self._mul_ops = ['mul']
@@ -87,8 +86,8 @@ class Quant2Int8MkldnnPass(object):
         self._reset_pass_idx_and_group('int8')
         graph = self._label_skip_quantized_op(graph)
         graph = self._gather_weight_thresholds_from_fake(graph)
-        graph = self._gather_output_scales_from_attr(graph)
         graph = self._gather_input_scales_from_fake(graph)
+        graph = self._gather_output_scales_from_attr(graph)
         graph = self._remove_fake_ops(graph)
         graph = self._dequantize_weights(graph)
         graph = self._optimize_fp32_graph(graph)
@@ -160,12 +159,16 @@ class Quant2Int8MkldnnPass(object):
                     op_node.op()._set_attr("skip_quant", True)
         return graph
 
-    def _gather_input_scales_from_fake(self, graph):
-        def _add_scale_for_vars(var_names, use_unsigned_int, lod_tensor):
-            scales = self._var_quant_scales
-            for var_name in var_names:
+    def _add_scale_for_vars(self, var_names, use_unsigned_int, lod_tensor):
+        """
+        Save quantization scales for variables. Do not overwrite.
+        """
+        scales = self._var_quant_scales
+        for var_name in var_names:
+            if var_name not in scales:
                 scales[var_name] = (use_unsigned_int, lod_tensor)
 
+    def _gather_input_scales_from_fake(self, graph):
         # fake_quantize_dequantize_abs_max doesn't have scale value
         fake_ops = ['fake_quantize_dequantize_moving_average_abs_max']
         fake_ops.extend(self._fake_quantize_types)
@@ -185,8 +188,8 @@ class Quant2Int8MkldnnPass(object):
                 scale[scale == np.Inf] = 0.0
                 lod_tensor = self._convert_scale2tensor(scale)
                 use_unsigned_int = False
-                _add_scale_for_vars([input_name, output_name], use_unsigned_int,
-                                    lod_tensor)
+                self._add_scale_for_vars([input_name, output_name],
+                                         use_unsigned_int, lod_tensor)
 
         return graph
 
@@ -219,8 +222,8 @@ class Quant2Int8MkldnnPass(object):
                 use_unsigned_int = False
                 for output_name in op.op().outputs():
                     for out_var_name in op.op().output(output_name):
-                        self._var_quant_scales[out_var_name] = (
-                            use_unsigned_int, scale_lod_tensor)
+                        self._add_scale_for_vars(
+                            [out_var_name], use_unsigned_int, scale_lod_tensor)
 
         return graph
 
@@ -239,24 +242,21 @@ class Quant2Int8MkldnnPass(object):
                     output_name = op.output("Out")[0]
                     tensor_names = [input_name, output_name]
 
-                    # Scale is not quantized, so if it doesn't have any scales
-                    # to propagate, its tensors won't be added to the waiting list.
-                    if all(name not in self._var_quant_scales for name in tensor_names) \
-                            and op.name() != 'scale':
+                    if all(name not in self._var_quant_scales
+                           for name in tensor_names):
                         waiting_for_scale.update(tensor_names)
                         continue
-
-                    if input_name in self._var_quant_scales:
+                    elif input_name in self._var_quant_scales:
                         self._var_quant_scales[
                             output_name] = self._var_quant_scales[input_name]
                     elif output_name in self._var_quant_scales:
-                        if op.name() == 'scale':
-                            _update_scale_op_in_scale(op, input_name,
-                                                      output_name)
-                        else:
-                            self._var_quant_scales[
-                                input_name] = self._var_quant_scales[
-                                    output_name]
+                        self._var_quant_scales[
+                            input_name] = self._var_quant_scales[output_name]
+                elif op.name() in self._scale_ops:
+                    input_name = op.input("X")[0]
+                    output_name = op.output("Out")[0]
+                    if output_name in self._var_quant_scales:
+                        _update_scale_op_in_scale(op, input_name, output_name)
             return waiting_for_scale
 
         waiting_for_scale = _update_scales(graph)
-- 
Gitee


From dbc552c9b7574d54dbb6490d5f1a8cbbfedd09fc Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:34:14 +0800
Subject: [PATCH 39/45] !48 add relu forward kernel and backward kernel Merge
 pull request !48 from PaddlePaddle-Gardener/relu_forward_backward

---
 paddle/fluid/operators/activation_op.cu | 284 +++++++++++++++++++++++-
 1 file changed, 283 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 2033081af2..29498da0f0 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -10,8 +10,276 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/math/math_cuda_utils.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
 
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using float16 = paddle::platform::float16;
+
+template <typename T>
+struct CudaVecType {
+  using type = T;
+  static constexpr int vecsize = 1;
+};
+
+template <>
+struct CudaVecType<platform::float16> {
+  using type = __half2;
+  static constexpr int vecsize = 2;
+};
+
+template <>
+struct CudaVecType<float> {
+  using type = float4;
+  static constexpr int vecsize = 4;
+};
+
+template <typename T>
+class BaseGPUFunctor {
+ public:
+  using ELEMENT_TYPE = T;
+};
+
+/* ========================================================================== */
+
+/* ===========================    relu forward   ============================ */
+template <typename T>
+class ReluGPUFuctor : public BaseGPUFunctor<T> {
+ private:
+  T zero_;
+
+ public:
+  ReluGPUFuctor() { zero_ = static_cast<T>(0.0f); }
+
+  // for relu forward when T is double
+  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
+      const typename CudaVecType<T>::type* x);
+
+  // when num % vecsize != 0 this func will be used
+  __device__ __forceinline__ T ComputeRemainder(const T x) {
+    return x > zero_ ? x : zero_;
+  }
+};
+
+template <>
+__device__ __forceinline__ CudaVecType<double>::type
+ReluGPUFuctor<double>::Compute(const CudaVecType<double>::type* x) {
+// relu forward : out = max(x, 0)
+#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+  return __ldg(x) > zero_ ? __ldg(x) : zero_;
+#else
+  return (*x) > zero_ ? (*x) : zero_;
+#endif
+}
+
+template <>
+__device__ __forceinline__ CudaVecType<float>::type
+ReluGPUFuctor<float>::Compute(const CudaVecType<float>::type* xx) {
+  // relu forward : out = max(xx, 0)
+  return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y),
+                     (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w));
+}
+
+template <>
+__device__ __forceinline__ CudaVecType<float16>::type
+ReluGPUFuctor<float16>::Compute(const CudaVecType<float16>::type* in) {
+// relu forward : out = max(in, 0)
+#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+  const half2 kzero = __float2half2_rn(0.0f);
+  return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in));
+#else
+  const float2 xx = __half22float2(*in);
+  return __floats2half2_rn((xx.x > 0.0f) * static_cast<float>(xx.x),
+                           (xx.y > 0.0f) * static_cast<float>(xx.y));
+#endif
+}
+/* ========================================================================== */
+
+/* ===========================    relu backward   ============================
+ */
+
+template <typename T>
+class ReluGradGPUFunctor : public BaseGPUFunctor<T> {
+ private:
+  T zero_;
+
+ public:
+  ReluGradGPUFunctor() { zero_ = static_cast<T>(0.0f); }
+
+  // for relu backward when T is double
+  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
+      const typename CudaVecType<T>::type* out,
+      const typename CudaVecType<T>::type* dout);
+
+  // when num % vecsize != 0 this func will be used
+  __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) {
+    // relu backward : dx = out > 0 ? dout : 0;
+    return out > zero_ ? dout : zero_;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+
+template <>
+__device__ __forceinline__ CudaVecType<double>::type
+ReluGradGPUFunctor<double>::Compute(const CudaVecType<double>::type* out,
+                                    const CudaVecType<double>::type* dout) {
+// relu backward : dx = out > 0 ? dout : 0;
+#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+  return __ldg(out) > zero_ ? __ldg(dout) : zero_;
+#else
+  return (*out) > zero_ ? (*dout) : zero_;
+#endif
+}
+
+template <>
+__device__ __forceinline__ CudaVecType<float>::type
+ReluGradGPUFunctor<float>::Compute(const CudaVecType<float>::type* out,
+                                   const CudaVecType<float>::type* dout) {
+  // relu backward : dx = out > 0 ? dout : 0;
+  return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y),
+                     (out->z > zero_) * (dout->z),
+                     (out->w > zero_) * (dout->w));
+}
+
+template <>
+__device__ __forceinline__ CudaVecType<float16>::type
+ReluGradGPUFunctor<float16>::Compute(const CudaVecType<float16>::type* out,
+                                     const CudaVecType<float16>::type* dout) {
+// relu backward : dx = out > 0 ? dout : 0;
+#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+  const half2 kzero = __float2half2_rn(0.0f);
+  return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout));
+#else
+  const float2 xx = __half22float2(*out);
+  const float2 yy = __half22float2(*dout);
+  return __floats2half2_rn((xx.x > 0.0f) * static_cast<float>(yy.x),
+                           (xx.y > 0.0f) * static_cast<float>(yy.y));
+#endif
+}
+
+/* ========================================================================== */
+
+template <typename T, typename Functor>
+__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout,
+                                        T* dx, int num, Functor functor) {
+  using VecType = typename CudaVecType<T>::type;
+  constexpr int vecsize = CudaVecType<T>::vecsize;
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+  int loop = num / vecsize;
+  int tail = num % vecsize;
+  const VecType* in_forward = reinterpret_cast<const VecType*>(forward_data);
+  const VecType* in_dout = reinterpret_cast<const VecType*>(dout);
+  VecType* out = reinterpret_cast<VecType*>(dx);
+
+  for (int i = idx; i < loop; i += stride) {
+    out[i] = functor.Compute((in_forward + i), (in_dout + i));
+  }
+
+  while (idx == loop && tail) {
+    dx[num - tail] =
+        functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]);
+    --tail;
+  }
+}
+
+template <typename T, typename Functor>
+__global__ void ActivationkernelVec(const T* src, T* dst, int num,
+                                    Functor functor) {
+  constexpr int vecsize = CudaVecType<T>::vecsize;
+  using VecType = typename CudaVecType<T>::type;
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+  int loop = num / vecsize;
+  int tail = num % vecsize;
+  const VecType* in = reinterpret_cast<const VecType*>(src);
+  VecType* out = reinterpret_cast<VecType*>(dst);
+
+  for (int i = idx; i < loop; i += stride) {
+    out[i] = functor.Compute((in + i));
+  }
+
+  while (idx == loop && tail) {
+    dst[num - tail] = functor.ComputeRemainder(src[num - tail]);
+    --tail;
+  }
+}
+
+template <typename DeviceContext, typename Functor>
+class ActivationGPUKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = nullptr;
+    framework::Tensor* out = nullptr;
+    ExtractActivationTensor(context, &in_x, &out);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    int num = in_x->numel();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>(dev_ctx.GetPlace(),
+                                          static_cast<size_t>(num * sizeof(T)));
+
+    int block = 512;
+#ifdef __HIPCC__
+    block = 256;
+#endif
+    Functor functor;
+    constexpr int vecsize = CudaVecType<T>::vecsize;
+    int grid = max((num / vecsize + block - 1) / block, 1);
+    ActivationkernelVec<T, Functor><<<grid, block>>>(input_data, output_data,
+                                                     num, functor);
+  }
+};
+
+template <typename DeviceContext, typename Functor>
+class ActivationGradGPUKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor *x, *out, *d_out;
+    framework::Tensor* d_x = nullptr;
+    x = out = d_out = nullptr;
+    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &x, &out, &d_out,
+                                                    &d_x);
+    int numel = d_out->numel();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto* dx_data = d_x->mutable_data<T>(
+        dev_ctx.GetPlace(), static_cast<size_t>(numel * sizeof(T)));
+    auto* dout_data = d_out->data<T>();
+
+    auto* forward_data = dout_data;
+    if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
+      // Only need forward output Out
+      forward_data = out->data<T>();
+    } else if (static_cast<int>(Functor::FwdDeps()) ==
+               static_cast<int>(kDepX)) {
+      // Only need forward input X
+      forward_data = x->data<T>();
+    }
+
+    int block = 512;
+#ifdef __HIPCC__
+    block = 256;
+#endif
+    Functor functor;
+    constexpr int vecsize = CudaVecType<T>::vecsize;
+    int grid = max((numel / vecsize + block - 1) / block, 1);
+    ActivationGradKernelVec<T, Functor><<<grid, block>>>(
+        forward_data, dout_data, dx_data, numel, functor);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
@@ -60,7 +328,21 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ===========================    relu register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluCUDAFunctor, ReluGradFunctor);
+REGISTER_OP_CUDA_KERNEL(
+    relu, ops::ActivationGPUKernel<paddle::platform::CUDADeviceContext,
+                                   ops::ReluGPUFuctor<float>>,
+    ops::ActivationGPUKernel<paddle::platform::CUDADeviceContext,
+                             ops::ReluGPUFuctor<double>>,
+    ops::ActivationGPUKernel<plat::CUDADeviceContext,
+                             ops::ReluGPUFuctor<plat::float16>>);
+
+REGISTER_OP_CUDA_KERNEL(
+    relu_grad, ops::ActivationGradGPUKernel<paddle::platform::CUDADeviceContext,
+                                            ops::ReluGradGPUFunctor<float>>,
+    ops::ActivationGradGPUKernel<paddle::platform::CUDADeviceContext,
+                                 ops::ReluGradGPUFunctor<double>>,
+    ops::ActivationGradGPUKernel<plat::CUDADeviceContext,
+                                 ops::ReluGradGPUFunctor<plat::float16>>);
 
 REGISTER_OP_CUDA_KERNEL(
     relu_grad_grad,
-- 
Gitee


From de132b0463104eed55ba324756f568105c238ecc Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:34:14 +0800
Subject: [PATCH 40/45] !49 trt plugin upgrade to pluginv2ext Merge pull
 request !49 from PaddlePaddle-Gardener/trt_plugin_v2

---
 .../inference/tensorrt/convert/split_op.cc    |   2 +-
 paddle/fluid/inference/tensorrt/engine.cc     |   9 +-
 paddle/fluid/inference/tensorrt/engine.h      |   7 ++
 .../inference/tensorrt/plugin/CMakeLists.txt  |   3 +
 .../tensorrt/plugin/split_op_plugin.cu        |   5 -
 .../tensorrt/plugin/split_op_plugin.h         |  69 +++++++++--
 .../tensorrt/plugin/test_split_plugin.cc      |  58 +++++++++
 .../inference/tensorrt/plugin/trt_plugin.cc   |  78 ++++++++++--
 .../inference/tensorrt/plugin/trt_plugin.h    | 112 +++++++++++++++++-
 python/setup.py.in                            |  11 ++
 10 files changed, 322 insertions(+), 32 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc

diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc
index 768c6efaa6..5d494c2093 100644
--- a/paddle/fluid/inference/tensorrt/convert/split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc
@@ -101,7 +101,7 @@ class SplitOpConverter : public OpConverter {
           engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       plugin::SplitPlugin* plugin =
           new plugin::SplitPlugin(axis, output_lengths, with_fp16);
-      layer = engine_->AddPlugin(&input, input_num, plugin);
+      layer = engine_->AddPluginV2Ext(&input, input_num, plugin);
     }
 
     std::string layer_name = "split (Output: ";
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 0bba4581ff..99549fd6b5 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <string>
 
-#include "cuda_runtime_api.h"
+#include "cuda_runtime_api.h"  // NOLINT
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
@@ -353,6 +353,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
   return network()->addPluginExt(inputs, num_inputs, *plugin);
 }
 
+nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2Ext(
+    nvinfer1::ITensor *const *inputs, int num_inputs,
+    plugin::PluginTensorRTV2Ext *plugin) {
+  owned_plugin_v2ext_.emplace_back(plugin);
+  return network()->addPluginV2(inputs, num_inputs, *plugin);
+}
+
 void TensorRTEngine::freshDeviceId() {
   int count;
   cudaGetDeviceCount(&count);
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 0e399578fa..de2924824f 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -305,8 +305,14 @@ class TensorRTEngine {
   }
 
   int GetDeviceId() { return device_id_; }
+
   nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                     int num_inputs, plugin::PluginTensorRT*);
+
+  nvinfer1::IPluginV2Layer* AddPluginV2Ext(nvinfer1::ITensor* const* inputs,
+                                           int num_inputs,
+                                           plugin::PluginTensorRTV2Ext* plugin);
+
   void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) {
     quant_dynamic_range_[tensor] = range;
   }
@@ -414,6 +420,7 @@ class TensorRTEngine {
       itensor_map_;
 
   std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;
+  std::vector<std::unique_ptr<plugin::PluginTensorRTV2Ext>> owned_plugin_v2ext_;
 
   // TensorRT related internal members
   template <typename T>
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index e37beb3b8e..7ee16a598d 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -6,3 +6,6 @@ nv_library(tensorrt_plugin
            qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu
            hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu
            DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
+
+nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
+  paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin)
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index 256aa28206..1b5c39f8ff 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -22,11 +22,6 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
-SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
-  return new SplitPlugin(buffer, length);
-}
-REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);
-
 template <typename T>
 __device__ int upper_bound(T const* vals, int n, T const& key) {
   int i = 0;
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
index 5c47ec3a99..e43b57357f 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -25,7 +25,7 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
-class SplitPlugin : public PluginTensorRT {
+class SplitPlugin : public PluginTensorRTV2Ext {
  public:
   SplitPlugin() {}
   SplitPlugin(int axis, std::vector<int> const& output_lengths, bool with_fp16)
@@ -39,13 +39,20 @@ class SplitPlugin : public PluginTensorRT {
     DeserializeValue(&serial_data, &serial_length, &output_length_);
   }
 
-  SplitPlugin* clone() const override {
-    auto* ptr = new SplitPlugin(axis_, output_length_, with_fp16_);
+  nvinfer1::IPluginV2Ext* clone() const override {
+    SplitPlugin* ptr = new SplitPlugin(axis_, output_length_, with_fp16_);
+    ptr->setPluginNamespace(this->getPluginNamespace());
     ptr->shareData(this);
     return ptr;
   }
 
-  const char* getPluginType() const override { return "split_plugin"; }
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType* input_types,
+                                       int nb_inputs) const override {
+    return input_types[0];
+  }
+
+  const char* getPluginType() const override { return "split_plugin_v2ext"; }
   int getNbOutputs() const override { return output_length_.size(); }
   nvinfer1::Dims getOutputDimensions(int index,
                                      const nvinfer1::Dims* input_dims,
@@ -53,17 +60,18 @@ class SplitPlugin : public PluginTensorRT {
 
   int initialize() override;
   void terminate() override;
-  int enqueue(int batchSize, const void* const* inputs, void** outputs,
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
               void* workspace, cudaStream_t stream) override;
 
+  void destroy() override { delete this; }
+
  protected:
-  size_t getSerializationSize() override {
-    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
-           SerializedSize(output_length_) + getBaseSerializationSize();
+  size_t getSerializationSize() const override {
+    return SerializedSize(axis_) + SerializedSize(output_length_) +
+           getBaseSerializationSize();
   }
 
-  void serialize(void* buffer) override {
-    SerializeValue(&buffer, getPluginType());
+  void serialize(void* buffer) const override {
     serializeBase(buffer);
     SerializeValue(&buffer, axis_);
     SerializeValue(&buffer, output_length_);
@@ -83,6 +91,47 @@ class SplitPlugin : public PluginTensorRT {
   void shareData(const SplitPlugin* another);
 };
 
+class SplitPluginCreator : public nvinfer1::IPluginCreator {
+ public:
+  SplitPluginCreator() {}
+  const char* getPluginName() const override { return "split_plugin_v2ext"; }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    // not implemented
+    return nullptr;
+  }
+
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    auto plugin = new SplitPlugin(serial_data, serial_length);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+
+REGISTER_TRT_PLUGIN_V2(SplitPluginCreator);
+
 #if IS_TRT_VERSION_GE(6000)
 class SplitPluginDynamic : public DynamicPluginTensorRT {
  public:
diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc
new file mode 100644
index 0000000000..6636513a55
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+TEST(split_op_plugin, test_plugin) {
+  int axis = 1;
+  std::vector<int> output_lengths{1, 1};
+  bool with_fp16 = false;
+  std::vector<nvinfer1::DataType> input_types{nvinfer1::DataType::kFLOAT};
+  std::vector<nvinfer1::Dims> input_dims;
+
+  SplitPlugin sp_plugin(axis, output_lengths, with_fp16);
+  nvinfer1::Dims in_dims;
+  in_dims.nbDims = 4;
+  input_dims.push_back(in_dims);
+  sp_plugin.configurePlugin(input_dims.data(), 1, nullptr, 2,
+                            input_types.data(), nullptr, nullptr, nullptr,
+                            nvinfer1::PluginFormat::kNCHW, 4);
+  sp_plugin.initialize();
+  sp_plugin.getPluginType();
+  sp_plugin.canBroadcastInputAcrossBatch(0);
+  sp_plugin.getNbOutputs();
+  auto clone_plugin = sp_plugin.clone();
+  clone_plugin->setPluginNamespace("test");
+  clone_plugin->destroy();
+  sp_plugin.getOutputDataType(0, input_types.data(), 1);
+  sp_plugin.terminate();
+}
+
+TEST(split_op_plugin, test_plugin_creater) {
+  SplitPluginCreator creator;
+  creator.getFieldNames();
+  creator.createPlugin("test", nullptr);
+  creator.setPluginNamespace("test");
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
index fd721b1614..55bc786746 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
@@ -19,27 +19,50 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+inline void Seria(void*& buffer,  // NOLINT
+                  const std::vector<nvinfer1::Dims>& input_dims,
+                  size_t max_batch_size, nvinfer1::DataType data_type,
+                  nvinfer1::PluginFormat data_format, bool with_fp16) {
+  SerializeValue(&buffer, input_dims);
+  SerializeValue(&buffer, max_batch_size);
+  SerializeValue(&buffer, data_type);
+  SerializeValue(&buffer, data_format);
+  SerializeValue(&buffer, with_fp16);
+}
+
+inline void Deseria(void const*& serial_data, size_t& serial_length,  // NOLINT
+                    std::vector<nvinfer1::Dims>* input_dims,
+                    size_t* max_batch_size, nvinfer1::DataType* data_type,
+                    nvinfer1::PluginFormat* data_format, bool* with_fp16) {
+  DeserializeValue(&serial_data, &serial_length, input_dims);
+  DeserializeValue(&serial_data, &serial_length, max_batch_size);
+  DeserializeValue(&serial_data, &serial_length, data_type);
+  DeserializeValue(&serial_data, &serial_length, data_format);
+  DeserializeValue(&serial_data, &serial_length, with_fp16);
+}
+
+inline size_t SeriaSize(const std::vector<nvinfer1::Dims>& input_dims,
+                        size_t max_batch_size, nvinfer1::DataType data_type,
+                        nvinfer1::PluginFormat data_format, bool with_fp16) {
+  return (SerializedSize(input_dims) + SerializedSize(max_batch_size) +
+          SerializedSize(data_type) + SerializedSize(data_format) +
+          SerializedSize(with_fp16));
+}
+
 void PluginTensorRT::serializeBase(void*& buffer) {
-  SerializeValue(&buffer, input_dims_);
-  SerializeValue(&buffer, max_batch_size_);
-  SerializeValue(&buffer, data_type_);
-  SerializeValue(&buffer, data_format_);
-  SerializeValue(&buffer, with_fp16_);
+  Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_,
+        with_fp16_);
 }
 
 void PluginTensorRT::deserializeBase(void const*& serial_data,
                                      size_t& serial_length) {
-  DeserializeValue(&serial_data, &serial_length, &input_dims_);
-  DeserializeValue(&serial_data, &serial_length, &max_batch_size_);
-  DeserializeValue(&serial_data, &serial_length, &data_type_);
-  DeserializeValue(&serial_data, &serial_length, &data_format_);
-  DeserializeValue(&serial_data, &serial_length, &with_fp16_);
+  Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_,
+          &data_type_, &data_format_, &with_fp16_);
 }
 
 size_t PluginTensorRT::getBaseSerializationSize() {
-  return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) +
-          SerializedSize(data_type_) + SerializedSize(data_format_) +
-          SerializedSize(with_fp16_));
+  return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_,
+                   with_fp16_);
 }
 
 bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
@@ -58,6 +81,35 @@ void PluginTensorRT::configureWithFormat(
   max_batch_size_ = max_batch_size;
 }
 
+void PluginTensorRTV2Ext::serializeBase(void*& buffer) const {
+  Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_,
+        with_fp16_);
+}
+
+void PluginTensorRTV2Ext::deserializeBase(void const*& serial_data,
+                                          size_t& serial_length) {
+  Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_,
+          &data_type_, &data_format_, &with_fp16_);
+}
+
+size_t PluginTensorRTV2Ext::getBaseSerializationSize() const {
+  return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_,
+                   with_fp16_);
+}
+
+void PluginTensorRTV2Ext::configurePlugin(
+    const nvinfer1::Dims* input_dims, int32_t nb_inputs,
+    const nvinfer1::Dims* output_dims, int32_t nb_outputs,
+    const nvinfer1::DataType* input_types,
+    const nvinfer1::DataType* output_types, const bool* input_is_broadcast,
+    const bool* output_is_broadcast, nvinfer1::PluginFormat float_format,
+    int32_t max_batch_size) {
+  input_dims_.assign(input_dims, input_dims + nb_inputs);
+  max_batch_size_ = max_batch_size;
+  data_format_ = float_format;
+  data_type_ = input_types[0];
+}
+
 }  // namespace plugin
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
index b3a3abe5d0..ce3133ae99 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
@@ -44,6 +44,7 @@ typedef std::function<PluginTensorRT*(const void*, size_t)>
 
 typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
 
+// Deprecated. Do not inherit this class, please refer to PluginTensorRTV2Ext
 class PluginTensorRT : public nvinfer1::IPluginExt {
  public:
   PluginTensorRT() : with_fp16_(false) {}
@@ -119,6 +120,114 @@ class PluginTensorRT : public nvinfer1::IPluginExt {
   bool with_fp16_;
 };
 
+// TensorRT introduced IPluginV2Ext after 5.1, Paddle no longer supports
+// versions before 5.1
+class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext {
+ public:
+  PluginTensorRTV2Ext() : with_fp16_(false) {}
+  PluginTensorRTV2Ext(const void* serialized_data, size_t length) {}
+
+  nvinfer1::Dims const& getInputDims(int index) const {
+    return input_dims_.at(index);
+  }
+  size_t getMaxBatchSize() const { return max_batch_size_; }
+  nvinfer1::DataType getDataType() const { return data_type_; }
+  nvinfer1::PluginFormat getDataFormat() const { return data_format_; }
+
+  // The Func in IPluginV2Ext
+  virtual nvinfer1::DataType getOutputDataType(
+      int index, const nvinfer1::DataType* input_types,
+      int nb_inputs) const = 0;
+
+  virtual bool isOutputBroadcastAcrossBatch(int32_t output_index,
+                                            const bool* input_is_broadcasted,
+                                            int32_t nb_inputs) const {
+    return false;
+  }
+
+  virtual bool canBroadcastInputAcrossBatch(int32_t input_index) const {
+    return false;
+  }
+
+  void configurePlugin(const nvinfer1::Dims* input_dims, int32_t nb_inputs,
+                       const nvinfer1::Dims* output_dims, int32_t nb_outputs,
+                       const nvinfer1::DataType* input_types,
+                       const nvinfer1::DataType* output_types,
+                       const bool* input_is_broadcast,
+                       const bool* output_is_broadcast,
+                       nvinfer1::PluginFormat float_format,
+                       int32_t max_batch_size) override;
+
+  virtual IPluginV2Ext* clone() const = 0;
+
+  void attachToContext(cudnnContext*, cublasContext*,
+                       nvinfer1::IGpuAllocator*) override {}
+
+  void detachFromContext() override {}
+
+  // The Func in IPluginV2
+  virtual const char* getPluginType() const = 0;
+  const char* getPluginVersion() const override { return "1"; }
+  virtual int32_t getNbOutputs() const { return 1; }
+  virtual nvinfer1::Dims getOutputDimensions(int32_t index,
+                                             const nvinfer1::Dims* inputs,
+                                             int32_t nb_input) = 0;
+  // Check format support. The default is FLOAT32 and NCHW.
+  bool supportsFormat(nvinfer1::DataType type,
+                      nvinfer1::PluginFormat format) const override {
+    return ((type == nvinfer1::DataType::kFLOAT) &&
+            (format == nvinfer1::PluginFormat::kNCHW));
+  }
+  // Initialize the layer for execution.
+  // This is called when the engine is created.
+  int initialize() override { return 0; }
+
+  // Shutdown the layer. This is called when the engine is destroyed
+  void terminate() override {}
+
+  // Find the workspace size required by the layer
+  size_t getWorkspaceSize(int) const override { return 0; }
+
+  // Execute the layer
+  virtual int enqueue(int batch_size, const void* const* inputs, void** outputs,
+                      void* workspace, cudaStream_t stream) = 0;
+
+  // Find the size of the serialization buffer required
+  virtual size_t getSerializationSize() const = 0;
+
+  // Serialize the layer config to buffer.
+  // TensorRT will call this func to serialize the configuration of TensorRT
+  // engine. It should not be called by users.
+  virtual void serialize(void* buffer) const = 0;
+
+  virtual void destroy() = 0;
+
+  void setPluginNamespace(const char* plugin_namespace) override {
+    name_space_ = plugin_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return name_space_.c_str();
+  }
+
+ protected:
+  void deserializeBase(void const*& serial_data,  // NOLINT
+                       size_t& serial_length);    // NOLINT
+  size_t getBaseSerializationSize() const;
+  void serializeBase(void*& buffer) const;  // NOLINT
+
+ protected:
+  std::vector<nvinfer1::Dims> input_dims_;
+  size_t max_batch_size_;
+  nvinfer1::DataType data_type_;
+  nvinfer1::PluginFormat data_format_;
+  std::vector<nvinfer1::ITensor*> inputs_;
+  bool with_fp16_;
+
+ private:
+  std::string name_space_;
+};
+
 #if IS_TRT_VERSION_GE(6000)
 class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
  public:
@@ -184,6 +293,7 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
   std::string name_space_;
   std::string plugin_base_;
 };
+#endif
 
 template <typename T>
 class TrtPluginRegistrarV2 {
@@ -203,8 +313,6 @@ class TrtPluginRegistrarV2 {
   static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
       plugin_registrar_##name {}
 
-#endif
-
 }  // namespace plugin
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/python/setup.py.in b/python/setup.py.in
index 64cfe6e9cc..69a8bc771a 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -336,6 +336,17 @@ if '${WITH_XPU_BKCL}' == 'ON':
     shutil.copy('${XPU_BKCL_LIB}', libs_path)
     package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}']
 
+# Only for lite xpu inference.
+if '${WITH_XPU}' == 'OFF' and '${XPU_SDK_ROOT}' != '':
+    xpu_api_lib = os.path.join('${XPU_SDK_ROOT}', 'XTDK/shlib/', 'libxpuapi.so')
+    xpu_rt_lib = os.path.join('${XPU_SDK_ROOT}', 'XTDK/runtime/shlib/', 'libxpurt.so')
+    if os.path.exists(xpu_api_lib):
+        shutil.copy(xpu_api_lib, libs_path)
+        package_data['paddle.libs']+=['libxpuapi.so']
+    if os.path.exists(xpu_rt_lib):
+        shutil.copy(xpu_rt_lib, libs_path)
+        package_data['paddle.libs']+=['libxpurt.so']
+
 ### Old custom op extension mechanism related, will be removed in 2.1.0 ###
 # copy libpaddle_framework.so to libs on linux
 if sys.platform.startswith('linux'):
-- 
Gitee


From fe1bf173ab156be2b6aa53ef0a5a38d3e9ed8efb Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:34:15 +0800
Subject: [PATCH 41/45] !50 Update windows compiler and CI from VS2015 to
 VS2017 Merge pull request !50 from PaddlePaddle-Gardener/VS2017

---
 CMakeLists.txt                                |  4 +-
 cmake/external/warpctc.cmake                  |  2 +-
 cmake/generic.cmake                           | 10 ++--
 cmake/init.cmake                              |  4 ++
 cmake/paddle_win.props                        |  2 +-
 paddle/fluid/inference/api/demo_ci/run.sh     |  4 +-
 .../api/demo_ci/run_windows_demo.bat          |  8 +--
 .../api/demo_ci/windows_inference.md          |  2 +-
 paddle/fluid/pybind/CMakeLists.txt            |  2 +-
 paddle/scripts/paddle_build.bat               | 49 ++++++++++++-------
 paddle/scripts/windows_build/build.bat        | 12 ++---
 paddle/scripts/windows_build/config.ini       |  2 +-
 python/CMakeLists.txt                         |  1 +
 13 files changed, 59 insertions(+), 43 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 10b3b0aba4..676c94591e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,8 @@ if(WITH_MUSL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
 endif()
 
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zm1000 /fp:fast")
+
 if(WIN32)
     option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
 
@@ -124,7 +126,7 @@ if(WIN32)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
 
     foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
-        string(APPEND ${flag_var} "/ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221")
+        set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221")
     endforeach(flag_var)
 
     if (WITH_WIN_DUMP_DBG)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index e633cae540..b0ef575f64 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR  ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_SOURCE_DIR  ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 set(WARPCTC_REPOSITORY  ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG         95a461eddeabd51099ef059dcfada1117eb1bfb8)
+set(WARPCTC_TAG         cd828e5b6c3b953b82af73f7f44cddc393a20efa)
 
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index ba86cfabdf..c85654a567 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -492,10 +492,8 @@ function(nv_library TARGET_NAME)
         message(FATAL "Please specify source file or library in nv_library.")
       endif()
     endif(nv_library_SRCS)
-    if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-      if(${MSVC_VERSION} LESS_EQUAL 1900)
-        set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
-      endif()
+    if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
     endif()
   endif()
 endfunction(nv_library)
@@ -512,7 +510,7 @@ function(nv_binary TARGET_NAME)
       add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
       common_link(${TARGET_NAME})
     endif()
-    if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+    if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
       set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
     endif()
   endif()
@@ -539,7 +537,7 @@ function(nv_test TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+    if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
       set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
     endif()
   endif()
diff --git a/cmake/init.cmake b/cmake/init.cmake
index aea0208875..19fdb6c601 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -18,6 +18,10 @@ if(NOT WIN32)
     set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
     set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
 else()
+    # It has not been used now, it can specify CUDA compile flag manualy,
+    # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous
+    # because CUDA will update by nvidia, then error will occur.
+    # Now, it's used in CUDA:[10.0, 10.2]
     set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
 endif()
 
diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props
index 0115ad4b59..296940dc3f 100644
--- a/cmake/paddle_win.props
+++ b/cmake/paddle_win.props
@@ -15,7 +15,7 @@
             <Warning>InheritFromHost</Warning>
 
             <BaseCommandLineTemplate>-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]</BaseCommandLineTemplate>
-            <BuildCommandLineTemplate>--use-local-env --cl-version $(CudaClVersion)</BuildCommandLineTemplate>
+            <BuildCommandLineTemplate>--use-local-env $(CudaClVersion)</BuildCommandLineTemplate>
             <BuildDynamicCommandLineTemplate>[CodeGeneration]</BuildDynamicCommandLineTemplate>
             <CleanCommandLineTemplate>-clean</CleanCommandLineTemplate>
             <!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] $(CudaForceSynchronousPdbWrites) /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index e11a5b9c33..53f9259666 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -88,7 +88,7 @@ for WITH_STATIC_LIB in ON OFF; do
       return 0
     fi
     # -----simple_on_word2vec on windows-----
-    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
+    cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \
       -DWITH_MKL=$TURN_ON_MKL \
       -DDEMO_NAME=simple_on_word2vec \
       -DWITH_GPU=$TEST_GPU_CPU \
@@ -107,7 +107,7 @@ for WITH_STATIC_LIB in ON OFF; do
 
     # -----vis_demo on windows-----
     rm -rf *
-    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
+    cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \
       -DWITH_MKL=$TURN_ON_MKL \
       -DDEMO_NAME=vis_demo \
       -DWITH_GPU=$TEST_GPU_CPU \
diff --git a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat
index 523dafa664..d17f516fcc 100644
--- a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat
+++ b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat
@@ -67,7 +67,7 @@ if /i "%use_gpu%"=="Y" (
 
 rem set_path_vs_command_prompt 
 :set_vcvarsall_dir
-SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat   =======>"
+SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat   =======>"
 set tmp_var=!vcvarsall_dir!
 call:remove_space
 set vcvarsall_dir=!tmp_var!   
@@ -177,16 +177,16 @@ if /i "%use_mkl%"=="N" (
 
 if /i "%gpu_inference%"=="Y" (
     if  "%demo_name%"=="trt_mobilenet_demo" (
-      cmake .. -G "Visual Studio 14 2015 Win64"  -T host=x64 -DWITH_GPU=ON ^
+      cmake .. -G "Visual Studio 15 2017 Win64"  -T host=x64 -DWITH_GPU=ON ^
       -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^
       -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON -DCUDA_LIB="%cuda_lib_dir%" -DUSE_TENSORRT=ON
     ) else (
-      cmake .. -G "Visual Studio 14 2015 Win64"  -T host=x64 -DWITH_GPU=ON ^
+      cmake .. -G "Visual Studio 15 2017 Win64"  -T host=x64 -DWITH_GPU=ON ^
       -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^
       -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON -DCUDA_LIB="%cuda_lib_dir%"
     )
 ) else (
-    cmake .. -G "Visual Studio 14 2015 Win64"  -T host=x64 -DWITH_GPU=OFF ^
+    cmake .. -G "Visual Studio 15 2017 Win64"  -T host=x64 -DWITH_GPU=OFF ^
     -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^
     -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON
 )
diff --git a/paddle/fluid/inference/api/demo_ci/windows_inference.md b/paddle/fluid/inference/api/demo_ci/windows_inference.md
index 73938cb995..c646c35146 100644
--- a/paddle/fluid/inference/api/demo_ci/windows_inference.md
+++ b/paddle/fluid/inference/api/demo_ci/windows_inference.md
@@ -8,7 +8,7 @@
 3. 进入Paddle/paddle/fluid/inference/api/demo_ci目录，新建build目录，然后使用cmake生成vs2015的solution文件。
 其中PADDLE_LIB是前面的paddle_inference.lib对应文件夹, CUDA_LIB指定为x64格式下的cuda系统库目录文件夹。
 ```shell
- cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_inference.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64
+ cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_inference.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64
 ```
 然后用vs2015打开对应的项目文件，注意使用静态链接 "/MT"，生成对应的exe。将openblas.dll放到exe所在目录。
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 5452b2160a..5c9655edfb 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -117,7 +117,7 @@ if(WITH_PYTHON)
     "${op_function_generator_path}/op_function_generator ${impl_file}\n"
     "if %ERRORLEVEL% NEQ 0 (\n"
     "    set /a build_times=%build_times%+1\n"
-    "    if %build_times% GTR 100 (\n"
+    "    if %build_times% GTR 5 (\n"
     "        exit /b 1\n"
     "    ) else (\n"
     "        goto :retry\n"
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 07de8ff6c2..c5bb7ea472 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -30,8 +30,13 @@ taskkill /f /im op_function_generator.exe
 wmic process where name="op_function_generator.exe" call terminate
 taskkill /f /im python.exe  2>NUL
 
+:: TODO: Temporarily，REMOVE after VS2017 is stable.
+set WITH_TPCACHE=OFF
+rmdir %cache_dir%\third_party_GPU /s/q
+rmdir %cache_dir%\third_party /s/q
+
 rem ------initialize common variable------
-if not defined GENERATOR set GENERATOR="Visual Studio 14 2015 Win64"
+if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64"
 if not defined BRANCH set BRANCH=develop
 if not defined WITH_TENSORRT set WITH_TENSORRT=ON 
 if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT
@@ -157,9 +162,11 @@ if %GENERATOR% == "Ninja" (
 
 rem ------show summary of current environment----------
 cmake --version
-nvcc --version
-where nvidia-smi
-nvidia-smi
+if "%WITH_GPU%"=="ON" (
+    nvcc --version
+    where nvidia-smi
+    nvidia-smi
+)
 python %work_dir%\tools\summary_env.py
 %cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh
 
@@ -241,7 +248,9 @@ echo    ========================================
 echo    Step 1. Cmake ...
 echo    ========================================
 
-call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
+rem Configure the environment for 64-bit builds. 'DISTUTILS_USE_SDK' indicates that the user has selected the compiler.
+call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
+set DISTUTILS_USE_SDK=1
 
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%
@@ -261,16 +270,16 @@ if %day_now% NEQ %day_before% (
     echo %day_now% > %cache_dir%\day.txt
     type %cache_dir%\day.txt
     if %day_now% EQU 21 (
-        rmdir %cache_dir%\third_party_GPU/ /s/q
-        rmdir %cache_dir%\third_party/ /s/q
+        rmdir %cache_dir%\third_party_GPU /s/q
+        rmdir %cache_dir%\third_party /s/q
     )
     if %day_now% EQU 11 (
-        rmdir %cache_dir%\third_party_GPU/ /s/q
-        rmdir %cache_dir%\third_party/ /s/q
+        rmdir %cache_dir%\third_party_GPU /s/q
+        rmdir %cache_dir%\third_party /s/q
     )
     if %day_now% EQU 01 (
-        rmdir %cache_dir%\third_party_GPU/ /s/q
-        rmdir %cache_dir%\third_party/ /s/q
+        rmdir %cache_dir%\third_party_GPU /s/q
+        rmdir %cache_dir%\third_party /s/q
     )
 )
 
@@ -294,14 +303,14 @@ if "%WITH_GPU%"=="ON" (
 )
 
 :cmake_impl
-echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
+echo cmake .. -G %GENERATOR% -T host=x64 -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^
 -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
 -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
 -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^
 -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME%
 
-cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
+cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^
 -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
 -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
@@ -322,14 +331,16 @@ echo    ========================================
 echo    Step 2. Buile Paddle ...
 echo    ========================================
 
-for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*2/3
+for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*4/5
+echo "PARALLEL PROJECT COUNT is %PARALLEL_PROJECT_COUNT%"
 set build_times=1
 :build_tp
 echo Build third_party the %build_times% time:
+
 if %GENERATOR% == "Ninja" (
     ninja third_party
 ) else (
-    msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj
+    MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:quiet third_party.vcxproj
 )
 if %ERRORLEVEL% NEQ 0 (
     set /a build_times=%build_times%+1  
@@ -352,9 +363,9 @@ if %GENERATOR% == "Ninja" (
     ninja -j %PARALLEL_PROJECT_COUNT%
 ) else (
     if "%WITH_CLCACHE%"=="OFF" (
-        msbuild /m:%PARALLEL_PROJECT_COUNT% /p:Configuration=Release /verbosity:%LOG_LEVEL% paddle.sln
+        MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
     ) else (
-        msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% paddle.sln
+        MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% paddle.sln
     )
 )
 
@@ -579,7 +590,7 @@ echo     git fetch upstream $BRANCH # develop is not fetched>>  check_change_of_
 echo fi>>  check_change_of_unittest.sh
 echo git checkout -b origin_pr >>  check_change_of_unittest.sh
 echo git checkout -f $BRANCH >>  check_change_of_unittest.sh
-echo cmake .. -G %GENERATOR% -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
+echo cmake .. -G %GENERATOR% -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^
 -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
 -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
@@ -696,7 +707,7 @@ echo    ========================================
 echo    Clean up environment  at the end ...
 echo    ========================================
 taskkill /f /im cmake.exe  2>NUL
-taskkill /f /im msbuild.exe 2>NUL
+taskkill /f /im MSBuild.exe 2>NUL
 taskkill /f /im git.exe 2>NUL
 taskkill /f /im cl.exe 2>NUL
 taskkill /f /im lib.exe 2>NUL
diff --git a/paddle/scripts/windows_build/build.bat b/paddle/scripts/windows_build/build.bat
index 6f99c23ccd..9a2ed349e5 100644
--- a/paddle/scripts/windows_build/build.bat
+++ b/paddle/scripts/windows_build/build.bat
@@ -61,8 +61,8 @@ echo Current directory : %cd%
 
 call:rest_env
 
-echo cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=OFF -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All
-cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=OFF -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All
+echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=OFF -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All
+cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=OFF -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All
 
 set  MSBUILDDISABLENODEREUSE=1
 
@@ -82,8 +82,8 @@ echo Current directory : %cd%
 
 call:rest_env
 
-echo cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd
-cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd
+echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd
+cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd
 
 set  MSBUILDDISABLENODEREUSE=1
 
@@ -107,8 +107,8 @@ echo Current directory : %cd%
 
 call:rest_env
 
-echo cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DCMAKE_BUILD_TYPE=Release -DWITH_PYTHON=OFF -DON_INFER=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All
-cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DCMAKE_BUILD_TYPE=Release -DWITH_PYTHON=OFF -DON_INFER=ON  -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR%  -DCUDA_ARCH_NAME=All
+echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DCMAKE_BUILD_TYPE=Release -DWITH_PYTHON=OFF -DON_INFER=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All
+cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DCMAKE_BUILD_TYPE=Release -DWITH_PYTHON=OFF -DON_INFER=ON  -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR%  -DCUDA_ARCH_NAME=All
 
 set  MSBUILDDISABLENODEREUSE=1
 
diff --git a/paddle/scripts/windows_build/config.ini b/paddle/scripts/windows_build/config.ini
index 32638d2873..750d7af8c2 100644
--- a/paddle/scripts/windows_build/config.ini
+++ b/paddle/scripts/windows_build/config.ini
@@ -11,7 +11,7 @@ http_proxy=#please edit your proxy#
 https_proxy=#please edit your proxy#
 
 # Just for example, please set by your windows environment
-vcvarsall_dir="C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat"
+vcvarsall_dir="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat"
 PYTHON3_PATH=C:\Python37
 
 CUDA_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0"
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index e0e845601c..938547f363 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -77,6 +77,7 @@ IF(WIN32)
   add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/
     COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    COMMENT "Packing whl packages------>>>"
     DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ELSE(WIN32)
   add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-- 
Gitee


From 47c72a7362e9a05dd5b7f614ec196ce5a84f34e9 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:34:15 +0800
Subject: [PATCH 42/45] !51 Fix the problem when rnns compatible with cudnn's
 parameters are created again Merge pull request !51 from
 PaddlePaddle-Gardener/fix_rnn

---
 python/paddle/fluid/framework.py              |  6 ++-
 .../rnn/test_rnn_cudnn_params_packing.py      | 53 +++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 036e8ab304..db487128bb 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -3031,7 +3031,11 @@ class Block(object):
                         # In startup_program, "c_broadcast" and "c_sync_comm_stream"
                         # are treated as initialization ops that cause error.
                         # Think of "c_broadcast" and "c_sync_comm_stream" as a special case here.
-                        if op.type in ["c_broadcast", "c_sync_comm_stream"]:
+                        # NOTE: "coalesce_tensor" is a special case for rnn with cudnn support
+                        if op.type in [
+                                "c_broadcast", "c_sync_comm_stream",
+                                "coalesce_tensor"
+                        ]:
                             continue
                         init_ops.append(op)
                 return init_ops
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py
new file mode 100644
index 0000000000..0712d5be23
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from unittest import TestCase
+
+
+def create_model():
+    hidden_size = 32
+    bilstm = paddle.nn.LSTM(
+        hidden_size, hidden_size, num_layers=1, direction='bidirectional')
+    return bilstm
+
+
+class TestRNNProgramClone(TestCase):
+    def setUp(self):
+        paddle.enable_static()
+
+    def test_rnn_with_cudnn_clone(self):
+        train_program = paddle.static.Program()
+        test_program = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+
+        # test a typical case in static graph usage: create two nearly
+        # identical program with a shared startup program to share their
+        # parameters
+        # 
+        # when creating a parameter, the name is checked. If there is already
+        # a parameter with the same name, which is the output of a operator
+        # (i.e. its creator), its re-creation is skipped.
+        # 
+        # but if that parameter has been the output of more than one operator,
+        # an exception is raised. For special cases, white list is added.
+        # flattening rnn's parameters for the need to call cudnn kernel is such 
+        # a case.
+        with paddle.static.program_guard(train_program, startup_prog):
+            with paddle.fluid.unique_name.guard():
+                bilstm = create_model()
+
+        with paddle.fluid.program_guard(test_program, startup_prog):
+            with paddle.fluid.unique_name.guard():
+                bilstm = create_model()
-- 
Gitee


From 89e978fb0a48d9004bad35b3c1609125c82bfd29 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:34:16 +0800
Subject: [PATCH 43/45] !52 Fix bfloat16 compile failed in gcc5.4 Merge pull
 request !52 from
 PaddlePaddle-Gardener/extension/fix_bloat16_gcc5.4_complie_faiiled

---
 paddle/fluid/platform/bfloat16.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h
index d1257f853e..6cb4901f1d 100644
--- a/paddle/fluid/platform/bfloat16.h
+++ b/paddle/fluid/platform/bfloat16.h
@@ -16,6 +16,7 @@
 
 #include <stdint.h>
 
+#include <cmath>
 #include <cstring>
 #include <iostream>
 #include <limits>
-- 
Gitee


From ef0cdfd9caf2750a94d98814be03f0f1017245a6 Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 29 Mar 2021 11:37:36 +0800
Subject: [PATCH 44/45] mirgate_31829

---
 python/paddle/nn/layer/pooling.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index 0f3c4449a3..5830af3a18 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -589,8 +589,8 @@ class MaxPool3D(layers.Layer):
 
     def __init__(self,
                  kernel_size,
-                 stride,
-                 padding,
+                 stride=None,
+                 padding=0,
                  return_mask=False,
                  ceil_mode=False,
                  data_format="NCDHW",
-- 
Gitee


From 4eeed669943f141376ff7da92b429ccc34bdba5f Mon Sep 17 00:00:00 2001
From: PaddlePaddle-Gardener <paddlepaddle_bot@163.com>
Date: Mon, 12 Apr 2021 14:04:23 +0800
Subject: [PATCH 45/45] mirgate_31832

---
 CMakeLists.txt                                |  11 +-
 cmake/cuda.cmake                              |  11 +-
 cmake/cudnn.cmake                             |   2 +-
 cmake/external/eigen.cmake                    |  48 +-
 cmake/external/warpctc.cmake                  |   9 +
 cmake/external/xpu.cmake                      |   2 +-
 cmake/flags.cmake                             |  22 +-
 go/README_cn.md                               |   1 +
 go/demo/mobilenet.go                          |   2 +-
 go/paddle/common.go                           |   2 +-
 go/paddle/config.go                           |   2 +-
 go/paddle/predictor.go                        |   4 +-
 go/paddle/tensor.go                           |   4 +-
 paddle/fluid/extension/include/ext_tensor.h   |   3 +
 paddle/fluid/extension/src/ext_tensor.cc      |  21 +-
 paddle/fluid/framework/CMakeLists.txt         |  37 +-
 paddle/fluid/framework/c/c_api.cc             |  53 --
 paddle/fluid/framework/c/c_api.h              |  55 --
 paddle/fluid/framework/custom_operator.cc     |   1 -
 paddle/fluid/framework/custom_tensor_utils.h  |   2 +-
 paddle/fluid/framework/executor_gc_helper.cc  |   1 -
 .../embedding_eltwise_layernorm_fuse_pass.cc  |  19 +-
 .../framework/ir/graph_pattern_detector.cc    |  30 +
 .../framework/ir/graph_pattern_detector.h     |   6 +-
 .../framework/ir/layer_norm_fuse_pass.cc      |   1 -
 .../ir/layer_norm_fuse_pass_tester.cc         |   1 -
 .../framework/ir/map_matmul_to_mul_pass.cc    |   4 +-
 .../ir/multihead_matmul_fuse_pass.cc          | 468 +++++++++
 .../framework/ir/multihead_matmul_fuse_pass.h |  66 +-
 paddle/fluid/framework/load_op_lib.h          | 120 ---
 paddle/fluid/framework/op_info.h              |   1 -
 paddle/fluid/framework/op_proto_maker.h       |   1 -
 paddle/fluid/framework/op_version_registry.h  |   1 -
 paddle/fluid/framework/operator.h             |   1 -
 paddle/fluid/framework/program_desc.h         |   1 -
 paddle/fluid/framework/reader.h               |   1 -
 paddle/fluid/framework/section_worker.cc      |  16 +
 paddle/fluid/framework/tensor_util.h          |   1 -
 paddle/fluid/framework/var_type.h             |   1 -
 paddle/fluid/framework/var_type_traits.h      |   1 -
 paddle/fluid/framework/variable_helper.h      |   1 -
 .../fluid/imperative/gradient_accumulator.cc  |   1 -
 .../fluid/inference/analysis/analysis_pass.h  |   1 -
 paddle/fluid/inference/analysis/helper.cc     |   1 -
 paddle/fluid/inference/analysis/helper.h      |   1 -
 .../analysis/passes/memory_optimize_pass.cc   |   1 +
 .../fluid/inference/api/analysis_predictor.cc |   9 +-
 .../inference/api/paddle_pass_builder.cc      |   5 +-
 paddle/fluid/inference/capi/pd_predictor.cc   |   9 +-
 paddle/fluid/inference/engine.h               |   1 -
 .../inference/tensorrt/convert/CMakeLists.txt |   7 +
 .../tensorrt/convert/affine_channel_op.cc     |  94 ++
 .../tensorrt/convert/batch_norm_op.cc         |  42 +-
 .../tensorrt/convert/multiclass_nms_op.cc     | 133 +++
 .../tensorrt/convert/roi_align_op.cc          |  86 ++
 .../inference/tensorrt/convert/yolo_box_op.cc |  79 ++
 paddle/fluid/inference/tensorrt/op_teller.cc  |  87 ++
 .../inference/tensorrt/plugin/CMakeLists.txt  |   2 +
 .../tensorrt/plugin/roi_align_op_plugin.cu    | 380 ++++++++
 .../tensorrt/plugin/roi_align_op_plugin.h     | 112 +++
 .../tensorrt/plugin/yolo_box_op_plugin.cu     | 404 ++++++++
 .../tensorrt/plugin/yolo_box_op_plugin.h      | 117 +++
 .../fluid/inference/tests/api/CMakeLists.txt  |  10 +-
 .../tests/api/lite_mul_model_test.cc          |   7 +-
 paddle/fluid/memory/memcpy.cc                 |   6 +-
 paddle/fluid/operators/activation_op.cu       |  55 ++
 paddle/fluid/operators/activation_op.h        |   4 +-
 paddle/fluid/operators/cast_op.h              |   1 -
 paddle/fluid/operators/cast_op_xpu.cc         |  40 +-
 paddle/fluid/operators/conv_cudnn_op_cache.h  |   5 -
 paddle/fluid/operators/cudnn_lstm_cache.h     |  10 +-
 paddle/fluid/operators/cudnn_rnn_cache.h      |   7 -
 .../detection/polygon_box_transform_op.cu     |   3 +-
 .../operators/distributed_ops/recv_save_op.cc |   1 -
 paddle/fluid/operators/dot_op.h               |   2 +-
 paddle/fluid/operators/fake_quantize_op.cc    |  69 +-
 paddle/fluid/operators/fake_quantize_op.cu    |   4 +-
 paddle/fluid/operators/fake_quantize_op.h     |  16 +-
 .../fused_embedding_eltwise_layernorm_op.cu   |   1 -
 .../fluid/operators/fused/fusion_lstm_op.cc   |  12 +
 .../fused/mkldnn/fusion_lstm_mkldnn_op.cc     |  19 +-
 paddle/fluid/operators/inplace_abn_op.cc      |   1 -
 paddle/fluid/operators/jit/benchmark.cc       |   5 +-
 paddle/fluid/operators/jit/test.cc            |   5 +-
 .../fluid/operators/math/concat_and_split.cu  |  48 +-
 paddle/fluid/operators/matmul_op.cc           |   2 +-
 paddle/fluid/operators/matmul_op_xpu.cc       |  77 +-
 paddle/fluid/operators/matmul_v2_op_xpu.cc    |  62 +-
 .../operators/mkldnn/concat_mkldnn_op.cc      |  20 +-
 paddle/fluid/operators/nll_loss_op.h          |   5 +-
 paddle/fluid/operators/one_hot_op.cc          |   1 -
 paddle/fluid/operators/one_hot_op_xpu.cc      |   1 -
 paddle/fluid/operators/one_hot_v2_op.cc       |   1 -
 paddle/fluid/operators/one_hot_v2_op_xpu.cc   |   1 -
 .../fluid/operators/optimizers/adam_op_xpu.cc |  22 +-
 .../operators/reader/create_py_reader_op.cc   |   1 -
 paddle/fluid/operators/reader/read_op.cc      |   1 -
 paddle/fluid/operators/recurrent_op.cc        |  25 +-
 paddle/fluid/operators/reshape_op.cc          |  28 +-
 paddle/fluid/operators/save_combine_op.h      |   1 -
 paddle/fluid/operators/save_op.h              |   1 -
 paddle/fluid/operators/set_value_op.cc        |  11 +-
 paddle/fluid/operators/set_value_op.h         |  72 +-
 .../softmax_with_cross_entropy_op_xpu.cc      |  18 +-
 paddle/fluid/operators/stack_op.h             |  12 +-
 paddle/fluid/operators/warpctc_op.h           |   3 +-
 paddle/fluid/platform/cudnn_helper.h          |  38 -
 paddle/fluid/platform/eigen_ext.h             |  97 +-
 paddle/fluid/platform/flags.cc                |  12 +
 .../pybind/global_value_getter_setter.cc      |   4 +-
 paddle/fluid/pybind/imperative.cc             |  14 +-
 paddle/fluid/pybind/op_function_generator.cc  |   6 +-
 paddle/fluid/pybind/pybind.cc                 |   3 -
 paddle/scripts/paddle_build.bat               |   5 +-
 paddle/scripts/paddle_build.sh                |   4 +-
 python/paddle/{README.md => README.rst}       |   0
 .../fleet/data_generator/data_generator.py    |  26 +-
 .../fleet/meta_optimizers/common.py           |  41 +-
 .../meta_optimizers/pipeline_optimizer.py     | 307 +++---
 python/paddle/fluid/__init__.py               |   1 +
 .../mixed_precision/bf16/__init__.py}         |  23 +-
 .../contrib/mixed_precision/bf16/amp_lists.py |  97 ++
 .../contrib/mixed_precision/bf16/amp_utils.py | 296 ++++++
 .../contrib/mixed_precision/fp16_utils.py     |  10 +-
 .../slim/quantization/imperative/qat.py       | 156 ++-
 .../slim/quantization/imperative/quant_nn.py  | 111 ++-
 .../slim/quantization/imperative/utils.py     |  66 ++
 .../slim/tests/test_imperative_out_scale.py   |  39 +-
 .../contrib/slim/tests/test_imperative_qat.py |  26 +-
 .../test_imperative_qat_addquantdequant.py    |  27 +-
 python/paddle/fluid/dataloader/collate.py     |  47 +-
 python/paddle/fluid/device_worker.py          |   2 +-
 python/paddle/fluid/dygraph/container.py      |  22 +-
 .../dygraph_to_static/loop_transformer.py     |   2 +-
 .../fluid/dygraph/dygraph_to_static/utils.py  |  12 +-
 python/paddle/fluid/executor.py               |  23 +-
 python/paddle/fluid/framework.py              |  42 +-
 python/paddle/fluid/layers/nn.py              |   4 +
 python/paddle/fluid/optimizer.py              | 900 ++++++++++++------
 python/paddle/fluid/reader.py                 |   6 +
 python/paddle/fluid/tests/CMakeLists.txt      |   3 +-
 .../fluid/tests/custom_op/CMakeLists.txt      |  45 +-
 .../fluid/tests/custom_op/custom_relu_op.cc   |   3 +-
 .../paddle/fluid/tests/custom_op/relu_op.cc   | 115 ---
 .../paddle/fluid/tests/custom_op/relu_op.cu   |  87 --
 .../paddle/fluid/tests/custom_op/relu_op3.cc  | 115 ---
 .../paddle/fluid/tests/custom_op/relu_op3.cu  |  87 --
 .../fluid/tests/custom_op/setup_build.py      |  37 -
 .../fluid/tests/custom_op/test_custom_op.py   | 120 ---
 .../custom_op/test_custom_relu_op_jit.py      |   4 +-
 .../fluid/tests/custom_op/test_jit_load.py    |  51 -
 .../fluid/tests/custom_op/test_setup_build.py |  69 --
 .../tests/custom_op/test_setup_install.py     |  65 --
 .../fluid/tests/unittests/CMakeLists.txt      |   1 +
 .../dygraph_to_static/test_for_enumerate.py   |  29 +-
 .../ir/inference/inference_pass_test.py       |   3 +
 .../inference/test_trt_affine_channel_op.py   | 141 +++
 .../inference/test_trt_multiclass_nms_op.py   | 144 +++
 .../ir/inference/test_trt_roi_align_op.py     | 119 +++
 .../ir/inference/test_trt_scale_op.py         |  28 +
 .../ir/inference/test_trt_yolo_box_op.py      |  76 ++
 .../mkldnn/test_fusion_lstm_int8_mkldnn_op.py | 153 +++
 .../fluid/tests/unittests/pipeline_mnist.py   |  32 +-
 .../tests/unittests/test_activation_op.py     |   6 +-
 .../fluid/tests/unittests/test_conv2d_op.py   |  36 +
 .../unittests/test_cross_entropy_loss.py      | 638 ++++++++++++-
 .../tests/unittests/test_data_generator.py    |  40 +
 .../tests/unittests/test_fake_quantize_op.py  |   5 +-
 .../test_imperative_container_layerlist.py    |  12 +
 .../tests/unittests/test_lr_scheduler.py      |  12 +
 .../fluid/tests/unittests/test_matmul_op.py   |  36 +
 .../tests/unittests/test_matmul_v2_op.py      |   7 +-
 .../tests/unittests/test_set_value_op.py      |  59 +-
 .../fluid/tests/unittests/test_var_base.py    |   5 +
 .../fluid/tests/unittests/test_warpctc_op.py  |  29 +-
 .../tests/unittests/xpu/test_cast_op_xpu.py   |   8 +-
 .../tests/unittests/xpu/test_matmul_op_xpu.py |  58 +-
 .../unittests/xpu/test_matmul_v2_op_xpu.py    | 205 ++--
 python/paddle/incubate/__init__.py            |   1 -
 python/paddle/nn/functional/loss.py           | 382 ++++++--
 python/paddle/nn/functional/norm.py           |   9 +-
 python/paddle/nn/layer/common.py              |   4 +-
 python/paddle/nn/layer/conv.py                |   5 +
 python/paddle/nn/layer/loss.py                | 273 ++++--
 python/paddle/optimizer/adam.py               |   2 +-
 python/paddle/optimizer/adamax.py             |   2 +-
 python/paddle/optimizer/lr.py                 |   5 +-
 python/paddle/optimizer/optimizer.py          |  19 +-
 python/paddle/tensor/creation.py              |   2 +-
 python/paddle/utils/__init__.py               |   3 +-
 .../utils/cpp_extension/cpp_extension.py      |   8 +-
 .../utils/cpp_extension/extension_utils.py    |  41 +-
 python/setup.py.in                            |  44 +-
 tools/static_mode_white_list.py               |   1 +
 tools/test_op_benchmark.sh                    |   4 +-
 195 files changed, 6752 insertions(+), 2507 deletions(-)
 delete mode 100644 paddle/fluid/framework/c/c_api.cc
 delete mode 100644 paddle/fluid/framework/c/c_api.h
 delete mode 100644 paddle/fluid/framework/load_op_lib.h
 create mode 100644 paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc
 create mode 100644 paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc
 create mode 100644 paddle/fluid/inference/tensorrt/convert/roi_align_op.cc
 create mode 100644 paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h
 rename python/paddle/{README.md => README.rst} (100%)
 rename python/paddle/fluid/{tests/custom_op/setup_install.py => contrib/mixed_precision/bf16/__init__.py} (46%)
 create mode 100644 python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py
 create mode 100644 python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py
 delete mode 100644 python/paddle/fluid/tests/custom_op/relu_op.cc
 delete mode 100644 python/paddle/fluid/tests/custom_op/relu_op.cu
 delete mode 100644 python/paddle/fluid/tests/custom_op/relu_op3.cc
 delete mode 100644 python/paddle/fluid/tests/custom_op/relu_op3.cu
 delete mode 100644 python/paddle/fluid/tests/custom_op/setup_build.py
 delete mode 100644 python/paddle/fluid/tests/custom_op/test_custom_op.py
 delete mode 100644 python/paddle/fluid/tests/custom_op/test_jit_load.py
 delete mode 100644 python/paddle/fluid/tests/custom_op/test_setup_build.py
 delete mode 100644 python/paddle/fluid/tests/custom_op/test_setup_install.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 676c94591e..1aedc9e033 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.15)
+cmake_policy(VERSION 3.10)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@@ -38,11 +39,6 @@ endif()
 if (WITH_GPU  AND WITH_ASCEND)
     message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
 endif()
-# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
-if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
-    message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
-       "You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. Please refer to the install document: https://cmake.org/install/")
-endif()
 
 if(WITH_GPU AND NOT APPLE)
     enable_language(CUDA)
@@ -61,7 +57,10 @@ if(WITH_MUSL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
 endif()
 
+<<<<<<< HEAD
 #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zm1000 /fp:fast")
+=======
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
 if(WIN32)
     option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index c4d1384312..05b5595207 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -74,7 +74,7 @@ endfunction()
 #   select_nvcc_arch_flags(out_variable)
 function(select_nvcc_arch_flags out_variable)
   # List of arch names
-  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual")
   set(archs_name_default "Auto")
   list(APPEND archs_names "Auto")
 
@@ -108,6 +108,8 @@ function(select_nvcc_arch_flags out_variable)
     set(cuda_arch_bin "70")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
+    set(cuda_arch_bin "80")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -206,14 +208,11 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}")
 message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}")
 
-# Set C++11 support
+# Set C++14 support
 set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-if (NOT WIN32) # windows msvc2015 support c++11 natively.
-    # -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
-  set(CMAKE_CUDA_STANDARD 11)
-endif(NOT WIN32)
+set(CMAKE_CUDA_STANDARD 14)
 
 # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w
 # So replace /W[1-4] with /W0
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index d8d8f634e7..c82847100a 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file)
                 "${CUDNN_MAJOR_VERSION} * 1000 +
                  ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
             message(STATUS "Current cuDNN header is ${cudnn_header_file} "
-              "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
+              "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ")
         endif()
     endif()
 endmacro()
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 5a755a816c..f68db1eab3 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -14,11 +14,11 @@
 
 include(ExternalProject)
 
-# update eigen to the commit id 4da2c6b1 on 03/19/2020
+# update eigen to the commit id f612df27 on 03/16/2021
 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
 set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
 set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
-set(EIGEN_TAG        4da2c6b1974827b1999bab652a3d4703e1992d26)
+set(EIGEN_TAG        f612df273689a19d25b45ca4f8269463207c4fee)
 
 cache_third_party(extern_eigen3
     REPOSITORY    ${EIGEN_REPOSITORY}
@@ -27,48 +27,6 @@ cache_third_party(extern_eigen3
 
 if(WIN32)
     add_definitions(-DEIGEN_STRONG_INLINE=inline)
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
-    # For Windows
-    # which will cause a compilation error in Tensor:74:
-    # "can not open file 'unistd.h'"
-    # so use following patch to solve compilation error On Windows.
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2)
-    # For VS2015
-    # which will cause a compilation error in TensorBlock.h:1028:
-    # "syntax error"
-    # so use following patch to solve compilation error On Windows.
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3)
-    set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y)
-elseif(LINUX)
-    # For gxx=4.8, __GXX_ABI_VERSION is less than 1004
-    # which will cause a compilation error in Geometry_SSE.h:38:
-    # "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)"
-    # refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60
-    # add -fabi-version=4 could avoid above error, but will cause "double free corruption" when compile with gcc8
-    # so use following patch to solve compilation error with different version of gcc.
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1)
-    # The compiler fully support const expressions since c++14,
-    # but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11
-    # add patch to avoid compilation error in c++11
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2)
-    if(WITH_ROCM)
-        # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC
-        # which will cause compiler error of using __host__ funciont in __host__ __device__
-        file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3)
-        file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3)
-        # For HIPCC Eigen::internal::scalar_sum_op<bool,bool> is not EIGEN_DEVICE_FUNC
-        # which will cause compiler error of using __host__ funciont in __host__ __device__
-        file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4)
-        file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4)
-        set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4})
-    else()
-        set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2})
-    endif()
 endif()
 
 set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR})
@@ -82,7 +40,7 @@ ExternalProject_Add(
     PREFIX          ${EIGEN_PREFIX_DIR}
     SOURCE_DIR      ${EIGEN_SOURCE_DIR}
     UPDATE_COMMAND    ""
-    PATCH_COMMAND   ${EIGEN_PATCH_COMMAND}
+    PATCH_COMMAND     ""
     CONFIGURE_COMMAND ""
     BUILD_COMMAND     ""
     INSTALL_COMMAND   ""
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index b0ef575f64..8d66118fcd 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -14,11 +14,19 @@
 
 INCLUDE(ExternalProject)
 
+IF(WITH_ROCM)
+    add_definitions(-DWARPCTC_WITH_HIP)
+ENDIF()
+
 SET(WARPCTC_PREFIX_DIR  ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_SOURCE_DIR  ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 set(WARPCTC_REPOSITORY  ${GIT_URL}/baidu-research/warp-ctc.git)
+<<<<<<< HEAD
 set(WARPCTC_TAG         cd828e5b6c3b953b82af73f7f44cddc393a20efa)
+=======
+set(WARPCTC_TAG         c690fc5755abbdbdc98ef78d51ec10a6748a8cd1)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
@@ -57,6 +65,7 @@ ExternalProject_Add(
                     -DCMAKE_CXX_FLAGS_DEBUG=$<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>
                     -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
                     -DWITH_GPU=${WITH_GPU}
+                    -DWITH_ROCM=${WITH_ROCM}
                     -DWITH_OMP=${USE_OMP}
                     -DWITH_TORCH=OFF
                     -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index b5a3f01547..16c69a7b50 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT)
   elseif(WITH_SUNWAY)
       SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
   else()
-      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE)
+      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_03_30.tar.gz" CACHE STRING "" FORCE)
   endif()
 
   SET(XPU_SOURCE_DIR              "${THIRD_PARTY_PATH}/xpu")
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index e110524dd1..a2ddad557c 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -4,10 +4,10 @@ include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
 include(CheckTypeSize)
 
-function(CheckCompilerCXX11Flag)
+function(CheckCompilerCXX14Flag)
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
-            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
+        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4)
+            message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.")
         elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2)
             message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2")
         endif()
@@ -20,23 +20,15 @@ function(CheckCompilerCXX11Flag)
                 message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
             endif()
         else()
-            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
-                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
+            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4)
+                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.")
             endif()
         endif()
     endif()
 endfunction()
 
-CheckCompilerCXX11Flag()
-if (WITH_GPU)
-    if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
-       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
-    else()
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-    endif()
-else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-endif()
+CheckCompilerCXX14Flag()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
diff --git a/go/README_cn.md b/go/README_cn.md
index a184ecbb8d..040540e939 100644
--- a/go/README_cn.md
+++ b/go/README_cn.md
@@ -50,6 +50,7 @@ output_data := value.Interface().([][]float32)
 
 运行
 ```bash
+go mod init github.com/paddlepaddle
 export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH
 go run ./demo/mobilenet.go
 ```
diff --git a/go/demo/mobilenet.go b/go/demo/mobilenet.go
index 1b42fe8049..c1ca2e967f 100644
--- a/go/demo/mobilenet.go
+++ b/go/demo/mobilenet.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 package main
 
-import "../paddle"
+import "github.com/paddlepaddle/paddle"
 import "strings"
 import "io/ioutil"
 import "strconv"
diff --git a/go/paddle/common.go b/go/paddle/common.go
index 4bf9476593..cbbde6a45f 100644
--- a/go/paddle/common.go
+++ b/go/paddle/common.go
@@ -15,7 +15,7 @@
 package paddle
 
 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
 // #include <stdbool.h>
 // #include <paddle_c_api.h>
 import "C"
diff --git a/go/paddle/config.go b/go/paddle/config.go
index 89f7d7e63f..68a3123099 100644
--- a/go/paddle/config.go
+++ b/go/paddle/config.go
@@ -15,7 +15,7 @@
 package paddle
 
 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
 // #include <stdbool.h>
 // #include <stdlib.h>
 // #include <paddle_c_api.h>
diff --git a/go/paddle/predictor.go b/go/paddle/predictor.go
index 59bad908e6..5f2b2c81a6 100644
--- a/go/paddle/predictor.go
+++ b/go/paddle/predictor.go
@@ -15,7 +15,7 @@
 package paddle
 
 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
 // #include <stdbool.h>
 // #include "paddle_c_api.h"
 import "C"
@@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string {
 }
 
 func (predictor *Predictor) GetOutputNames() []string {
-	names := make([]string, predictor.GetInputNum())
+	names := make([]string, predictor.GetOutputNum())
 	for i := 0; i < len(names); i++ {
 		names[i] = predictor.GetOutputName(i)
 	}
diff --git a/go/paddle/tensor.go b/go/paddle/tensor.go
index e6e2c53fef..6fbcf039f8 100644
--- a/go/paddle/tensor.go
+++ b/go/paddle/tensor.go
@@ -15,7 +15,7 @@
 package paddle
 
 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
 // #include <stdbool.h>
 // #include <stdlib.h>
 // #include <string.h>
@@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va
 		value := reflect.Indirect(ptr)
 		value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0])))
 		if len(shape) == 1 && value.Len() > 0 {
-			switch value.Index(1).Kind() {
+			switch value.Index(0).Kind() {
 			case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32:
 				binary.Read(r, Endian(), value.Interface())
 				return
diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h
index be492a6d55..52606b2a7f 100644
--- a/paddle/fluid/extension/include/ext_tensor.h
+++ b/paddle/fluid/extension/include/ext_tensor.h
@@ -52,6 +52,9 @@ class PD_DLL_DECL Tensor {
   /// \brief Construct a Tensor on target Place for CustomOp.
   /// Generally it's only used for user to create Tensor.
   explicit Tensor(const PlaceType& place);
+  /// \brief Construct a Tensor on target Place with shape for CustomOp.
+  /// Generally it's only used for user to create Tensor.
+  Tensor(const PlaceType& place, const std::vector<int64_t>& shape);
   /// \brief Reset the shape of the tensor.
   /// Generally it's only used for the input tensor.
   /// Reshape must be called before calling
diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc
index 0cae8f4af7..e9705e2101 100644
--- a/paddle/fluid/extension/src/ext_tensor.cc
+++ b/paddle/fluid/extension/src/ext_tensor.cc
@@ -102,13 +102,32 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
 
 void Tensor::reshape(const std::vector<int64_t> &shape) {
   GET_CASTED_TENSOR
-  tensor->Resize(framework::make_ddim(shape));
+  auto new_dim = framework::make_ddim(shape);
+  if (tensor->numel() != framework::product(new_dim)) {
+    LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger "
+                    "or smaller"
+                 << "than original shape will not change your tensor's memory "
+                    "Please call"
+                 << "paddle::Tensor::mutable_data<T>() after to reallocate "
+                    "your tensor's size."
+                 << std::endl;
+  }
+  tensor->Resize(new_dim);
 }
 
 Tensor::Tensor(const PlaceType &place)
     : tensor_(std::make_shared<framework::LoDTensor>()),
       place_(place),
       stream_(StreamWrapper()) {}
+
+Tensor::Tensor(const PlaceType &place, const std::vector<int64_t> &shape)
+    : tensor_(std::make_shared<framework::LoDTensor>()),
+      place_(place),
+      stream_(StreamWrapper()) {
+  GET_CASTED_TENSOR
+  tensor->Resize(framework::make_ddim(shape));
+}
+
 template <typename T>
 T *Tensor::mutable_data(const PlaceType &place) {
   place_ = place;
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 1fa4ce9b57..2842f230ca 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -360,46 +360,11 @@ set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_prot
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 
-# Old custom op extension mechanism related, will be removed in 2.1.0
-cc_library(paddle_framework_shared
-    SHARED SRCS executor.cc operator.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc
-    ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc
-    DEPS ${FLUID_FRAMEWORK_MODULES})
-get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-set_target_properties(paddle_framework_shared PROPERTIES OUTPUT_NAME paddle_framework)
-target_link_libraries(paddle_framework_shared ${os_dependency_modules})
-
-if (LINUX)
-  set(FLUID_FRAMEWORK_SHARED_LIB
-      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.so
-      CACHE INTERNAL "Fluid framework lib")
-endif()
-
-if (WIN32)
-  if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
-    set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR})
-  else()
-    set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE})
-  endif()
-  set(FLUID_FRAMEWORK_IMPORT_LIB
-      ${paddle_framework_lib_path}/paddle_framework.lib
-      CACHE INTERNAL "Fluid framework lib")
-  set(FLUID_FRAMEWORK_SHARED_LIB
-      ${paddle_framework_lib_path}/paddle_framework.dll
-      CACHE INTERNAL "Fluid framework dll")
-endif()
-
-if(APPLE)
-  set(FLUID_FRAMEWORK_SHARED_LIB
-      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib
-      CACHE INTERNAL "Fluid framework lib")
-endif()
 if(WITH_TESTING AND TEST selected_rows_test)
   set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
 endif()
 
-# New custom op extension mechanism related
+##### 2.0 New custom op extension mechanism related #####
 
 # if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_
 set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer)
diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc
deleted file mode 100644
index 5e73c5cc23..0000000000
--- a/paddle/fluid/framework/c/c_api.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/c/c_api.h"
-
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-
-extern "C" {
-
-paddle::framework::OpInfoMap &PD_GetOpInfoMap() {
-  return paddle::framework::OpInfoMap::Instance();
-}
-
-void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool) {
-  paddle::platform::DeviceContextPool::SetPool(pool);
-}
-
-std::vector<std::string> PD_GetGradOpDescStrs(
-    const paddle::framework::OpDesc &op_desc,
-    const std::unordered_set<std::string> &no_grad_set,
-    std::unordered_map<std::string, std::string> *grad_to_var,
-    const std::vector<paddle::framework::BlockDesc *> &grad_block) {
-  auto &op_info = PD_GetOpInfoMap().Get(op_desc.Type());
-  std::vector<std::string> ret;
-  if (op_info.grad_op_maker_) {
-    auto grad_op_descs =
-        op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, grad_block);
-    size_t op_num = grad_op_descs.size();
-    ret.resize(op_num);
-    for (size_t i = 0; i < op_num; ++i) {
-      PADDLE_ENFORCE_EQ(
-          grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
-          paddle::platform::errors::Unavailable(
-              "Cannot serialize operator desc message."));
-    }
-  }
-  return ret;
-}
-
-}  // end extern "C"
diff --git a/paddle/fluid/framework/c/c_api.h b/paddle/fluid/framework/c/c_api.h
deleted file mode 100644
index a9ec402f38..0000000000
--- a/paddle/fluid/framework/c/c_api.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* copyright (c) 2019 paddlepaddle authors. all rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-class OpInfoMap;
-}  // namespace framework
-namespace platform {
-class DeviceContextPool;
-}  // namespace platform
-}  // namespace paddle
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// C-API to get global OpInfo map.
-paddle::framework::OpInfoMap &PD_GetOpInfoMap();
-
-// C-API to init global DeviceContextPool from outside.
-void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool);
-
-// C-API to serialize the grad op protocol message to a binary string.
-std::vector<std::string> PD_GetGradOpDescStrs(
-    const paddle::framework::OpDesc &op_desc,
-    const std::unordered_set<std::string> &no_grad_set,
-    std::unordered_map<std::string, std::string> *grad_to_var,
-    const std::vector<paddle::framework::BlockDesc *> &grad_block);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 1ebb8998c8..97d58df6dc 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -28,7 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/extension/include/ext_tensor.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/custom_tensor_utils.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_meta_info_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h
index fad1e3ee34..809a6b965a 100644
--- a/paddle/fluid/framework/custom_tensor_utils.h
+++ b/paddle/fluid/framework/custom_tensor_utils.h
@@ -37,7 +37,7 @@ class CustomTensorUtils {
   /// \brief Share data FROM another tensor.
   /// Use this to pass tensor from op to op
   /// \return void.
-  static void ShareDataFrom(const void* src, const Tensor& dst);
+  static void ShareDataFrom(const void* src, const paddle::Tensor& dst);
 
   static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType(
       const paddle::DataType& dtype) {
diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc
index c8bc735790..c06a3d4a18 100644
--- a/paddle/fluid/framework/executor_gc_helper.cc
+++ b/paddle/fluid/framework/executor_gc_helper.cc
@@ -18,7 +18,6 @@
 
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
index 84c6b03e76..44069f61d9 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -34,15 +34,19 @@ namespace patterns {
 static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name,
                                const std::string& arg,
                                bool is_persist = false) {
+  std::unordered_set<std::string> embedding_ops{"lookup_table",
+                                                "lookup_table_v2"};
   PDNode* node =
-      pattern->NewNode(name)->assert_is_op_input("lookup_table", arg);
+      pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg);
   if (is_persist) return node->assert_is_persistable_var();
   return node;
 }
 static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name,
                                    const std::string& arg) {
+  std::unordered_set<std::string> embedding_ops{"lookup_table",
+                                                "lookup_table_v2"};
   PDNode* node = pattern->NewNode(name)
-                     ->assert_is_only_output_of_op("lookup_table")
+                     ->assert_is_only_output_of_ops(embedding_ops)
                      ->assert_is_op_input("elementwise_add", arg)
                      ->AsIntermediate();
   return node;
@@ -56,10 +60,12 @@ void Embedding2Eltwise1Pattern::operator()() {
       create_emb_vars(pattern, lookup_table1_w_repr(), "W", true);
   auto* lookup_table2_w =
       create_emb_vars(pattern, lookup_table2_w_repr(), "W", true);
+  std::unordered_set<std::string> embedding_ops{"lookup_table",
+                                                "lookup_table_v2"};
   auto* lookup_table1 =
-      pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table");
+      pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops);
   auto* lookup_table2 =
-      pattern->NewNode(lookup_table2_repr())->assert_is_op("lookup_table");
+      pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops);
   auto* lookup_table1_out =
       create_emb_out_vars(pattern, lookup_table1_out_repr(), "X");
   auto* lookup_table2_out =
@@ -80,8 +86,10 @@ void Embedding1Eltwise1Pattern::operator()() {
       create_emb_vars(pattern, lookup_table1_x_repr(), "Ids");
   auto* lookup_table1_w =
       create_emb_vars(pattern, lookup_table1_w_repr(), "W", true);
+  std::unordered_set<std::string> embedding_ops{"lookup_table",
+                                                "lookup_table_v2"};
   auto* lookup_table1 =
-      pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table");
+      pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops);
   auto* lookup_table1_out =
       create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y");
   auto* eltwise_add =
@@ -347,4 +355,5 @@ REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination()
             .EQ("lookup_table", 0)
+            .LE("lookup_table_v2", 1)
             .EQ("elementweise_add", 0));
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index deb182c0fb..d74e8e5f65 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -652,6 +652,36 @@ PDNode *PDNode::assert_is_ops_input(
   return this;
 }
 
+PDNode *PDNode::assert_is_only_input_of_ops(
+    const std::unordered_set<std::string> &op_types) {
+  assert_is_var();
+  asserts_.emplace_back([=](Node *x) {
+    for (auto *op : x->outputs) {
+      if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) &&
+          op->inputs.size() == 1) {
+        return true;
+      }
+    }
+    return false;
+  });
+  return this;
+}
+
+PDNode *PDNode::assert_is_only_output_of_ops(
+    const std::unordered_set<std::string> &op_types) {
+  assert_is_var();
+  asserts_.emplace_back([=](Node *x) {
+    for (auto *op : x->inputs) {
+      if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) &&
+          op->outputs.size() == 1) {
+        return true;
+      }
+    }
+    return false;
+  });
+  return this;
+}
+
 bool VarLinksToOp(Node *node, const std::string &op_type) {
   for (auto *out : node->outputs) {
     if (out->IsOp() && out->Op()->Type() == op_type) {
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 2e518c1d4d..cfac01ec9d 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -28,7 +28,6 @@
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/inference/analysis/dot.h"
@@ -146,6 +145,11 @@ struct PDNode {
       const std::unordered_set<std::string>& op_types,
       const std::string& argument, int nth);
 
+  PDNode* assert_is_only_input_of_ops(
+      const std::unordered_set<std::string>& op_types);
+  PDNode* assert_is_only_output_of_ops(
+      const std::unordered_set<std::string>& op_types);
+
   PDNode* assert_has_n_inputs(size_t n);
   PDNode* assert_has_n_outputs(size_t n);
 
diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc
index 69edc3d87f..18d2e9817e 100644
--- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc
@@ -14,7 +14,6 @@
 
 #include <vector>
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc
index 5fd47b2173..5fe71fbc21 100644
--- a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc
@@ -17,7 +17,6 @@
 #include <vector>
 
 #include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_test_util.h"
 #include "paddle/fluid/framework/naive_executor.h"
diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc
index a2443c8698..c36123f65f 100644
--- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc
+++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc
@@ -57,7 +57,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const {
     std::vector<int64_t> y_shape = matmul_in_y->Var()->GetShape();
     size_t x_rank = x_shape.size();
     size_t y_rank = y_shape.size();
-    flag = flag && x_rank == 2 && y_rank == 2;
+    flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2;
 
     std::vector<Node*>& next_ops = matmul_out->outputs;
     flag = flag && next_ops.size() == 1 &&
@@ -69,7 +69,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const {
       desc.SetInput("X", {matmul_in_x->Name()});
       desc.SetInput("Y", {matmul_in_y->Name()});
       desc.SetOutput("Out", {matmul_out->Name()});
-      desc.SetAttr("x_num_col_dims", 1);
+      desc.SetAttr("x_num_col_dims", static_cast<int>(x_rank - 1));
       desc.SetAttr("y_num_col_dims", 1);
       if (matmul_op->Op()->HasAttr("enable_int8")) {
         desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8"));
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index e20c0667ec..e8f4dbd295 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -682,6 +682,447 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
   return fusion_count;
 }
 
+PDNode* MultiHeadMatmulV3Pattern::operator()() {
+  std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
+  auto* input0 = pattern->NewNode(input0_repr());
+  input0->assert_is_op_input("matmul");
+
+  // First path with scale
+  auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul");
+  auto* mul0_w_var = pattern->NewNode(mul0_w_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("matmul", "Y");
+  auto* mul0_out_var =
+      pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul");
+
+  decltype(mul0) eltadd0;
+  decltype(mul0) eltadd0_b_var;
+  decltype(mul0) eltadd0_out_var;
+
+  mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+
+  eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add");
+  eltadd0_b_var = pattern->NewNode(eltadd0_b_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("elementwise_add", "Y");
+
+  eltadd0_out_var = pattern->NewNode(eltadd0_out_repr())
+                        ->assert_is_op_output("elementwise_add");
+  eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_0 =
+      pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2");
+
+  auto* reshape2_0_out_var =
+      pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2");
+  reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_0 =
+      pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2");
+  auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr())
+                                   ->assert_is_op_output("transpose2");
+  transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul");
+
+  auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul");
+  auto* matmul_qk_out_var =
+      pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul");
+  matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+
+  auto* eltadd_qk =
+      pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add");
+  auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr())
+                              ->AsInput()
+                              ->assert_is_op_input("elementwise_add", "Y");
+  auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr())
+                                ->assert_is_op_output("elementwise_add");
+  eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax");
+
+  auto* softmax_qk =
+      pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax");
+  auto* softmax_qk_out_var =
+      pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax");
+  softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops);
+
+  auto* matmul_qkv =
+      pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops);
+  auto* matmul_qkv_out_var =
+      pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops);
+  matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_qkv =
+      pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2");
+  auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr())
+                                     ->assert_is_op_output("transpose2");
+  transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_qkv =
+      pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2");
+  auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr())
+                                   ->assert_is_op_output("reshape2");
+  reshape2_qkv_out_var->assert_is_op_input("matmul");
+
+  // Second path to matmul
+  auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul");
+  auto* mul1_w_var = pattern->NewNode(mul1_w_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("matmul", "Y");
+  auto* mul1_out_var =
+      pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul");
+
+  decltype(mul1) eltadd1;
+  decltype(mul1) eltadd1_b_var;
+  decltype(mul1) eltadd1_out_var;
+
+  mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+  eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add");
+  eltadd1_b_var = pattern->NewNode(eltadd1_b_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("elementwise_add", "Y");
+
+  eltadd1_out_var = pattern->NewNode(eltadd1_out_repr())
+                        ->assert_is_op_output("elementwise_add");
+  eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_1 =
+      pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2");
+
+  auto* reshape2_1_out_var =
+      pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2");
+  reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_1 =
+      pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2");
+  auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr())
+                                   ->assert_is_op_output("transpose2");
+  transpose2_1_out_var->AsIntermediate()->assert_is_op_input(
+      "matmul");  // link to matmul qk
+
+  // Third path to matmul
+  auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul");
+  auto* mul2_w_var = pattern->NewNode(mul2_w_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("matmul", "Y");
+  auto* mul2_out_var =
+      pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul");
+
+  decltype(mul2) eltadd2;
+  decltype(mul2) eltadd2_b_var;
+  decltype(mul2) eltadd2_out_var;
+
+  mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+  eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add");
+  eltadd2_b_var = pattern->NewNode(eltadd2_b_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("elementwise_add", "Y");
+
+  eltadd2_out_var = pattern->NewNode(eltadd2_out_repr())
+                        ->assert_is_op_output("elementwise_add");
+  eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_2 =
+      pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2");
+
+  auto* reshape2_2_out_var =
+      pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2");
+  reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_2 =
+      pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2");
+  auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr())
+                                   ->assert_is_op_output("transpose2");
+  transpose2_2_out_var->AsIntermediate()->assert_is_ops_input(
+      matmul_ops);  // link to matmul qkv
+
+  // Q path
+  mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var});
+  eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var});
+
+  reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var});
+  transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var});
+  // K path
+  mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var});
+  eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var});
+  reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var});
+  transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var});
+  // compute q*k
+  matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var})
+      .LinksTo({matmul_qk_out_var});
+  eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var})
+      .LinksTo({eltadd_qk_out_var});
+  softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var});
+  // V  path
+  mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var});
+  eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var});
+  reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var});
+  transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var});
+  // compute q*k*v
+  matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var})
+      .LinksTo({matmul_qkv_out_var});
+  transpose2_qkv->LinksFrom({matmul_qkv_out_var})
+      .LinksTo({transpose2_qkv_out_var});
+  reshape2_qkv->LinksFrom({transpose2_qkv_out_var})
+      .LinksTo({reshape2_qkv_out_var});
+
+  return transpose2_2_out_var;
+}
+
+static int BuildFusionV3(Graph* graph, const std::string& name_scope,
+                         Scope* scope) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+
+  // Create pattern.
+  MultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope);
+
+  multihead_pattern();
+  // Create New OpDesc
+  auto fuse_creater = [&](
+      Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
+      Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w,
+      Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b,
+      Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) {
+    auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha"));
+
+    // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H)
+    // bias (B * S * 3 * N * H) + bias (3 * N * H)
+    // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H)
+    auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable<LoDTensor>();
+    auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable<LoDTensor>();
+    auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable<LoDTensor>();
+
+    auto* bq_tensor =
+        scope->FindVar(eltadd0_b->Name())->GetMutable<LoDTensor>();
+    auto* bk_tensor =
+        scope->FindVar(eltadd1_b->Name())->GetMutable<LoDTensor>();
+    auto* bv_tensor =
+        scope->FindVar(eltadd2_b->Name())->GetMutable<LoDTensor>();
+
+    auto* wq_data = wq_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* wk_data = wk_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* wv_data = wv_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* bq_data = bq_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* bk_data = bk_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* bv_data = bv_tensor->mutable_data<float>(platform::CPUPlace());
+
+    auto combined_w_dims =
+        framework::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
+    auto combined_bias_dims = framework::make_ddim({3, bq_tensor->dims()[0]});
+
+    // reuse the mul0_w and eltadd_0_b nodes for the combined nodes.
+    auto* combined_w_desc = mul0_w->Var();
+    combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
+    combined_w_desc->SetPersistable(true);
+
+    auto* combined_bias_desc = eltadd0_b->Var();
+    combined_bias_desc->SetShape({3, bq_tensor->dims()[0]});
+    combined_bias_desc->SetPersistable(true);
+
+    framework::LoDTensor tmp_combined_w_tensor;
+    tmp_combined_w_tensor.Resize(combined_w_dims);
+    auto* tmp_combined_w_data =
+        tmp_combined_w_tensor.mutable_data<float>(platform::CPUPlace());
+
+    std::vector<float*> w_vec = {wq_data, wk_data, wv_data};
+    int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2];
+    // Combine the three fc weights together.
+    for (int i = 0; i < dims_h; i++) {
+      for (int j = 0; j < 3; j++) {
+        for (int k = 0; k < dims_w; k++) {
+          int out_index = i * (3 * dims_w) + j * dims_w + k;
+          int in_index = i * dims_w + k;
+          tmp_combined_w_data[out_index] = w_vec[j][in_index];
+        }
+      }
+    }
+
+    wq_tensor->Resize(combined_w_dims);
+    auto* new_combined_w_data =
+        wq_tensor->mutable_data<float>(platform::CPUPlace());
+    memcpy(new_combined_w_data, tmp_combined_w_data,
+           sizeof(float) * wq_tensor->numel());
+
+    scope->EraseVars({mul1_w->Name(), mul2_w->Name()});
+
+    framework::LoDTensor tmp_combined_bias_tensor;
+    tmp_combined_bias_tensor.Resize(combined_bias_dims);
+    auto* tmp_combined_bias_data =
+        tmp_combined_bias_tensor.mutable_data<float>(platform::CPUPlace());
+
+    size_t bias_size = bq_tensor->numel();
+    memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size);
+    memcpy(tmp_combined_bias_data + bias_size, bk_data,
+           sizeof(float) * bias_size);
+    memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data,
+           sizeof(float) * bias_size);
+
+    bq_tensor->Resize(combined_bias_dims);
+    auto* new_combined_bias_data =
+        bq_tensor->mutable_data<float>(platform::CPUPlace());
+    memcpy(new_combined_bias_data, tmp_combined_bias_data,
+           sizeof(float) * bq_tensor->numel());
+
+    scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()});
+
+    auto reshape_desc = reshape2->Op();
+    int head_number =
+        BOOST_GET_CONST(std::vector<int>, reshape_desc->GetAttr("shape")).at(2);
+
+    OpDesc multihead_op_desc;
+    multihead_op_desc.SetType("multihead_matmul");
+
+    multihead_op_desc.SetInput("Input", {input0->Name()});
+    multihead_op_desc.SetInput("W", {mul0_w->Name()});
+    multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()});
+    multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()});
+
+    multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()});
+    multihead_op_desc.SetAttr("alpha", scale_attr);
+    multihead_op_desc.SetAttr("head_number", head_number);
+
+    auto* multihead = graph->CreateOpNode(&multihead_op_desc);
+
+    IR_NODE_LINK_TO(input0, multihead);
+    IR_NODE_LINK_TO(mul0_w, multihead);
+    IR_NODE_LINK_TO(eltadd0_b, multihead);
+    IR_NODE_LINK_TO(eltadd_qk_b, multihead);
+
+    IR_NODE_LINK_TO(multihead, reshape2_qkv_out);
+  };
+
+  int fusion_count{0};
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out,
+                              multihead_pattern);
+
+    // nodes need be removed
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out,
+                              multihead_pattern);
+
+    // If weights or biases in qkv's fc are shared by multiple multihead_matmul
+    // patterns, we do not support this kind of fusion, this pass will not take
+    // effect.
+    bool is_fc_params_shared =
+        mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 ||
+        mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 ||
+        eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1;
+    if (is_fc_params_shared) {
+      return;
+    }
+    fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w,
+                 mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b,
+                 reshape2_0, reshape2_qkv_out, matmul_qk);
+
+    std::unordered_set<const Node*> marked_nodes({eltadd0,
+                                                  eltadd1,
+                                                  eltadd2,
+                                                  eltadd1_b,
+                                                  eltadd2_b,
+                                                  eltadd0_out,
+                                                  eltadd1_out,
+                                                  eltadd2_out,
+                                                  reshape2_0,
+                                                  reshape2_1,
+                                                  reshape2_2,
+                                                  reshape2_0_out,
+                                                  reshape2_1_out,
+                                                  reshape2_2_out,
+                                                  transpose2_0,
+                                                  transpose2_1,
+                                                  transpose2_2,
+                                                  transpose2_0_out,
+                                                  transpose2_1_out,
+                                                  transpose2_2_out,
+                                                  matmul_qk,
+                                                  matmul_qk_out,
+                                                  eltadd_qk,
+                                                  eltadd_qk_out,
+                                                  softmax_qk,
+                                                  softmax_qk_out,
+                                                  transpose2_qkv,
+                                                  transpose2_qkv_out,
+                                                  matmul_qkv,
+                                                  matmul_qkv_out,
+                                                  mul0,
+                                                  mul1,
+                                                  mul2,
+                                                  mul0_out,
+                                                  mul1_out,
+                                                  mul2_out,
+                                                  mul1_w,
+                                                  mul2_w,
+                                                  reshape2_qkv});
+    // Remove unneeded nodes.
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  };
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
 }  // namespace patterns
 
 void MultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const {
@@ -706,6 +1147,21 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const {
   AddStatis(fusion_count);
 }
 
+void MultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
+  auto* scope = param_scope();
+  PADDLE_ENFORCE_NOT_NULL(
+      scope,
+      platform::errors::Fatal(
+          "During the multiheadMatmul pass, The scope should not be null."));
+
+  int fusion_count = patterns::BuildFusionV3(graph, name_scope_, scope);
+  if (fusion_count > 0) {
+    graph->Set(kMultiheadMatmulPass, new bool(true));
+  }
+  AddStatis(fusion_count);
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -715,6 +1171,8 @@ REGISTER_PASS(multihead_matmul_fuse_pass,
 
 REGISTER_PASS(multihead_matmul_fuse_pass_v2,
               paddle::framework::ir::MultiHeadMatmulV2FusePass);
+REGISTER_PASS(multihead_matmul_fuse_pass_v3,
+              paddle::framework::ir::MultiHeadMatmulV3FusePass);
 REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination()
@@ -725,3 +1183,13 @@ REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2)
             .EQ("scale", 0)
             .LE("matmul", 1)
             .EQ("softmax", 0));
+REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v3)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .LE("elementwise_add", 1)
+            .EQ("reshape2", 0)
+            .EQ("transpose2", 0)
+            .EQ("scale", 0)
+            .LE("matmul", 1)
+            .EQ("matmul_v2", 0)
+            .EQ("softmax", 0));
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
index f5327dc710..c7f1336211 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
@@ -89,9 +89,63 @@ struct MultiHeadMatmulPattern : public PatternBase {
   PATTERN_DECL_NODE(matmul_qkv);
   PATTERN_DECL_NODE(matmul_qkv_out);
 };
+
+struct MultiHeadMatmulV3Pattern : public PatternBase {
+  MultiHeadMatmulV3Pattern(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "multihead_matmul_v3") {}
+
+  PDNode* operator()();
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(input0);
+  PATTERN_DECL_NODE(mul0);
+  PATTERN_DECL_NODE(mul1);
+  PATTERN_DECL_NODE(mul2);
+  PATTERN_DECL_NODE(mul0_w);
+  PATTERN_DECL_NODE(mul1_w);
+  PATTERN_DECL_NODE(mul2_w);
+  PATTERN_DECL_NODE(mul0_out);
+  PATTERN_DECL_NODE(mul1_out);
+  PATTERN_DECL_NODE(mul2_out);
+  PATTERN_DECL_NODE(eltadd0);    // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd1);    // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd2);    // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd0_b);  // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd1_b);  // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd2_b);  // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd0_out);
+  PATTERN_DECL_NODE(eltadd1_out);
+  PATTERN_DECL_NODE(eltadd2_out);
+  PATTERN_DECL_NODE(reshape2_0);
+  PATTERN_DECL_NODE(reshape2_1);
+  PATTERN_DECL_NODE(reshape2_2);
+  PATTERN_DECL_NODE(reshape2_qkv);
+  PATTERN_DECL_NODE(reshape2_0_out);
+  PATTERN_DECL_NODE(reshape2_1_out);
+  PATTERN_DECL_NODE(reshape2_2_out);
+  PATTERN_DECL_NODE(reshape2_qkv_out);
+  PATTERN_DECL_NODE(transpose2_0);
+  PATTERN_DECL_NODE(transpose2_1);
+  PATTERN_DECL_NODE(transpose2_2);
+  PATTERN_DECL_NODE(transpose2_qkv);
+  PATTERN_DECL_NODE(transpose2_0_out);
+  PATTERN_DECL_NODE(transpose2_1_out);
+  PATTERN_DECL_NODE(transpose2_2_out);
+  PATTERN_DECL_NODE(transpose2_qkv_out);
+  PATTERN_DECL_NODE(matmul_qk);
+  PATTERN_DECL_NODE(matmul_qk_out);
+  PATTERN_DECL_NODE(eltadd_qk);
+  PATTERN_DECL_NODE(eltadd_qk_b);
+  PATTERN_DECL_NODE(eltadd_qk_out);
+  PATTERN_DECL_NODE(softmax_qk);
+  PATTERN_DECL_NODE(softmax_qk_out);
+
+  PATTERN_DECL_NODE(matmul_qkv);
+  PATTERN_DECL_NODE(matmul_qkv_out);
+};
+
 }  // namespace patterns
 
-// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op.
 class MultiHeadMatmulFusePass : public FusePassBase {
  public:
   virtual ~MultiHeadMatmulFusePass() {}
@@ -112,6 +166,16 @@ class MultiHeadMatmulV2FusePass : public FusePassBase {
   const std::string name_scope_{"multihead_matmul_fuse_v2"};
 };
 
+class MultiHeadMatmulV3FusePass : public FusePassBase {
+ public:
+  virtual ~MultiHeadMatmulV3FusePass() {}
+
+ protected:
+  void ApplyImpl(Graph* graph) const;
+
+  const std::string name_scope_{"multihead_matmul_fuse_v3"};
+};
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/load_op_lib.h b/paddle/fluid/framework/load_op_lib.h
deleted file mode 100644
index 16cffe119d..0000000000
--- a/paddle/fluid/framework/load_op_lib.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-T *DynLoad(void *handle, std::string name) {
-  T *func = reinterpret_cast<T *>(dlsym(handle, name.c_str()));
-#if !defined(_WIN32)
-  auto errorno = dlerror();
-#else
-  auto errorno = GetLastError();
-#endif  // !_WIN32
-  PADDLE_ENFORCE_NOT_NULL(
-      func,
-      platform::errors::NotFound(
-          "Failed to load dynamic operator library, error code(%s).", errorno));
-  return func;
-}
-
-void LoadOpLib(const std::string &dso_name) {
-  void *handle = paddle::platform::dynload::GetOpDsoHandle(dso_name);
-
-  typedef OpInfoMap &get_op_info_t();
-  get_op_info_t *get_op_info =
-      DynLoad<get_op_info_t>(handle, "PD_GetOpInfoMap");
-  auto &op_info = get_op_info();
-  auto *dyn_info_map = op_info.mutable_map();
-
-  typedef std::vector<std::string> grad_op_desc_maker_t(
-      const OpDesc &, const std::unordered_set<std::string> &,
-      std::unordered_map<std::string, std::string> *,
-      const std::vector<BlockDesc *> &);
-
-  grad_op_desc_maker_t *grad_op_desc_maker =
-      DynLoad<grad_op_desc_maker_t>(handle, "PD_GetGradOpDescStrs");
-
-  auto &info_map = OpInfoMap::Instance();
-  for (const auto &n : *(dyn_info_map)) {
-    auto type = n.first;
-    if (type == "recurrent" || type == "recurrent_grad" ||
-        type == "conditional_block" || type == "conditional_block_grad") {
-      continue;
-    }
-    PADDLE_ENFORCE_NE(info_map.Has(n.first), true,
-                      platform::errors::AlreadyExists(
-                          "Operator (%s) has been registered.", type));
-    OpInfo info;
-    info.creator_ = n.second.creator_;
-
-    // If get the protocol buffer from dynamic library directly, there
-    // will be deconstruction error
-    // ** Error in `python`: free(): invalid pointer:
-    //  ...  paddle::framework::proto::OpDesc::SharedDtor()
-    // It seems a bug in protobuf, see
-    // https://github.com/protocolbuffers/protobuf/issues/435
-    // So, get the serialized binary string from dynamic library,
-    // then deserialize to protocol buffer.
-    info.grad_op_maker_ = [grad_op_desc_maker](
-        const OpDesc &op_desc,
-        const std::unordered_set<std::string> &no_grad_set,
-        std::unordered_map<std::string, std::string> *grad_to_var,
-        const std::vector<BlockDesc *> &grad_block) {
-      std::vector<std::string> strs =
-          grad_op_desc_maker(op_desc, no_grad_set, grad_to_var, grad_block);
-      std::vector<std::unique_ptr<OpDesc>> ret;
-      for (auto &str : strs) {
-        proto::OpDesc proto_desc;
-        PADDLE_ENFORCE_EQ(proto_desc.ParseFromString(str), true,
-                          platform::errors::InvalidArgument(
-                              "Failed to parse OpDesc from string."));
-        ret.emplace_back(new OpDesc(proto_desc, nullptr));
-      }
-      return ret;
-    };
-    info.proto_ = n.second.proto_;
-    info.checker_ = n.second.checker_;
-    info.infer_var_type_ = n.second.infer_var_type_;
-    info.infer_shape_ = n.second.infer_shape_;
-    info.infer_inplace_ = n.second.infer_inplace_;
-    info.infer_no_need_buffer_vars_ = n.second.infer_no_need_buffer_vars_;
-    info.use_default_grad_op_desc_maker_ =
-        n.second.use_default_grad_op_desc_maker_;
-    info.use_empty_grad_op_desc_maker_ = n.second.use_empty_grad_op_desc_maker_;
-
-    info_map.Insert(type, info);
-  }
-
-  typedef void init_device_t(platform::DeviceContextPool *);
-  init_device_t *init_dev =
-      DynLoad<init_device_t>(handle, "PD_InitDevicesPool");
-  init_dev(&(platform::DeviceContextPool::Instance()));
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
index af657232e9..ddd84bfd81 100644
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 912e82f60e..506c3eb1e0 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <string>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/framework.pb.h"
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h
index b9ec550761..5ae8f255d6 100644
--- a/paddle/fluid/framework/op_version_registry.h
+++ b/paddle/fluid/framework/op_version_registry.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_version_proto.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index e9ecf9b5a8..bf27a8e37e 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -27,7 +27,6 @@ limitations under the License. */
 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index cfef80b8d3..4ceb0c5c82 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/proto_desc.h"
 #include "paddle/fluid/platform/macros.h"
 
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
index a4207deb7e..e7c23eab1f 100644
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -20,7 +20,6 @@
 #include <vector>
 
 #include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 90a371e474..70266a2a3e 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -39,6 +39,7 @@ void SectionWorker::RunForward(
     int op_role = op->Attr<int>(std::string("op_role"));
     // We run op with op_role = kLRSched only for the first microbatch
     // to avoid increasing the @LR_DECAY_STEP@ multiple times.
+<<<<<<< HEAD
     bool run_first_mbatch = op_role == static_cast<int>(OpRole::kForward) ||
                             op_role == (static_cast<int>(OpRole::kForward) |
                                         static_cast<int>(OpRole::kLoss)) ||
@@ -46,6 +47,15 @@ void SectionWorker::RunForward(
     bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
                       op_role == (static_cast<int>(OpRole::kForward) |
                                   static_cast<int>(OpRole::kLoss));
+=======
+    bool run_first_mbatch = (op_role == static_cast<int>(OpRole::kForward)) ||
+                            (op_role == (static_cast<int>(OpRole::kForward) |
+                                         static_cast<int>(OpRole::kLoss))) ||
+                            (op_role == static_cast<int>(OpRole::kLRSched));
+    bool run_others = (op_role == static_cast<int>(OpRole::kForward)) ||
+                      (op_role == (static_cast<int>(OpRole::kForward) |
+                                   static_cast<int>(OpRole::kLoss)));
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
     if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) {
       VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch "
               << micro_id;
@@ -64,9 +74,15 @@ void SectionWorker::RunBackward(
         &unused_vars_) {
   for (auto &op : ops_) {
     int op_role = op->Attr<int>(std::string("op_role"));
+<<<<<<< HEAD
     if (op_role == static_cast<int>(OpRole::kBackward) ||
         op_role == (static_cast<int>(OpRole::kBackward) |
                     static_cast<int>(OpRole::kLoss))) {
+=======
+    if ((op_role == static_cast<int>(OpRole::kBackward)) ||
+        (op_role == (static_cast<int>(OpRole::kBackward) |
+                     static_cast<int>(OpRole::kLoss)))) {
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
       VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch "
               << micro_id;
       op->Run(*microbatch_scopes_[micro_id], place_);
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 8a127e0ed5..fd0f98784c 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/dlpack_tensor.h"
 #include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 
diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h
index 8affeda67b..2e35f9b845 100644
--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index b0d8f43a90..fc754cbaf1 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -21,7 +21,6 @@
 #include <vector>
 
 #include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h
index 6e65bc2c93..4cdfba2924 100644
--- a/paddle/fluid/framework/variable_helper.h
+++ b/paddle/fluid/framework/variable_helper.h
@@ -15,7 +15,6 @@ limitations under the License. */
 
 #include <vector>
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index deb504a1b6..b9df88b1f1 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -18,7 +18,6 @@
 #include <memory>
 #include <utility>
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/imperative/layer.h"
diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
index d5a972fab3..14a1c3eea3 100644
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <iosfwd>
 #include <string>
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/inference/analysis/argument.h"
 #include "paddle/fluid/inference/analysis/helper.h"
diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc
index 368ef2e558..ede0402f81 100644
--- a/paddle/fluid/inference/analysis/helper.cc
+++ b/paddle/fluid/inference/analysis/helper.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index ab49499351..cace420d87 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -25,7 +25,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
index 5e6960c4c7..fdfd2c60af 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -103,6 +103,7 @@ void MemoryOptimizePass::CollectVarMemorySize(
                                         "merge_lod_tensor",
                                         "equal",
                                         "sequence_pool",
+                                        "recurrent",
                                         "lod_reset"};
     for (auto* tmp : node->inputs) {
       CHECK(tmp->IsOp());
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 8f2b217a2f..9a39510f02 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1192,7 +1192,14 @@ USE_TRT_CONVERTER(scale);
 USE_TRT_CONVERTER(stack);
 USE_TRT_CONVERTER(clip);
 USE_TRT_CONVERTER(gather);
-
+<<<<<<< HEAD
+
+=======
+USE_TRT_CONVERTER(yolo_box);
+USE_TRT_CONVERTER(roi_align);
+USE_TRT_CONVERTER(affine_channel);
+USE_TRT_CONVERTER(multiclass_nms);
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 USE_TRT_CONVERTER(nearest_interp);
 #endif
 
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 61fcdb7a90..1d77ddaf73 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -86,6 +86,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
       "simplify_with_basic_ops_pass",           //
       "embedding_eltwise_layernorm_fuse_pass",  //
       "multihead_matmul_fuse_pass_v2",          //
+      "multihead_matmul_fuse_pass_v3",          //
       "skip_layernorm_fuse_pass",               //
       "conv_bn_fuse_pass",                      //
       "unsqueeze2_eltwise_fuse_pass",           //
@@ -235,8 +236,8 @@ void CpuPassStrategy::EnableMKLDNN() {
              "reshape_transpose_matmul_mkldnn_fuse_pass",  //
              "matmul_transpose_reshape_fuse_pass",         //
              // Disabled due to topology-dependent speed-up
-             //"fc_mkldnn_pass",
-             //"fc_act_mkldnn_fuse_pass",
+             // "fc_mkldnn_pass",
+             // "fc_act_mkldnn_fuse_pass",
              "batch_norm_act_fuse_pass",
              // TODO(intel): Please fix the bug on windows.
              // https://github.com/PaddlePaddle/Paddle/issues/29710
diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc
index c1bf4c974f..c4e195b6ec 100644
--- a/paddle/fluid/inference/capi/pd_predictor.cc
+++ b/paddle/fluid/inference/capi/pd_predictor.cc
@@ -207,13 +207,16 @@ int PD_GetOutputNum(const PD_Predictor* predictor) {
 }
 
 const char* PD_GetInputName(const PD_Predictor* predictor, int n) {
-  static std::vector<std::string> names = predictor->predictor->GetInputNames();
+  static std::vector<std::string> names;
+  names.resize(predictor->predictor->GetInputNames().size());
+  names[n] = predictor->predictor->GetInputNames()[n];
   return names[n].c_str();
 }
 
 const char* PD_GetOutputName(const PD_Predictor* predictor, int n) {
-  static std::vector<std::string> names =
-      predictor->predictor->GetOutputNames();
+  static std::vector<std::string> names;
+  names.resize(predictor->predictor->GetOutputNames().size());
+  names[n] = predictor->predictor->GetOutputNames()[n];
   return names[n].c_str();
 }
 
diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h
index 1a13ba5103..e29162cf5b 100644
--- a/paddle/fluid/inference/engine.h
+++ b/paddle/fluid/inference/engine.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include <string>
-#include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index b0d0229ec0..7ff3aa45c0 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -6,7 +6,14 @@ nv_library(tensorrt_converter
                 shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc
                 emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc
                 gather_op.cc
+<<<<<<< HEAD
 
+=======
+                yolo_box_op.cc
+                roi_align_op.cc
+                affine_channel_op.cc
+                multiclass_nms_op.cc
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
                 nearest_interp_op.cc
            DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
 
diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc
new file mode 100644
index 0000000000..813342c084
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc
@@ -0,0 +1,94 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Affine Channel Op
+ */
+class AffineChannelOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a fluid affine_channel op to tensorrt scale nd layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    std::string input_name = op_desc.Input("X").front();
+    std::string scale_name = op_desc.Input("Scale").front();
+    std::string bias_name = op_desc.Input("Bias").front();
+    std::string output_name = op_desc.Output("Out").front();
+
+    auto input_tensor = engine_->GetITensor(input_name);
+    auto idim = input_tensor->getDimensions();
+
+    auto* scale_v = scope.FindVar(scale_name);
+    auto* scale_t = scale_v->GetMutable<framework::LoDTensor>();
+    float* scale_ptr = engine_->GetWeightCPUData(scale_name, scale_t, false);
+
+    auto* bias_v = scope.FindVar(bias_name);
+    auto* bias_t = bias_v->GetMutable<framework::LoDTensor>();
+    float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t, false);
+
+    auto data_layout = framework::StringToDataLayout(
+        BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout")));
+
+    PADDLE_ENFORCE_EQ(
+        data_layout, framework::DataLayout::kNCHW,
+        platform::errors::InvalidArgument(
+            "TensorRT affine channel converter can only convert NCHW format. "
+            "Other format should be run in fluid mode. Report a bug on github "
+            "issue if you see this line."));
+
+    // tensorrt scalend layer only support spatial dims >= 2,
+    // so nhwc is not availabe (spatial dims == 0)
+    const int channel_axis = engine_->with_dynamic_shape();
+
+    TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT,
+                                         static_cast<void*>(scale_ptr),
+                                         (size_t)idim.d[channel_axis]};
+    TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT,
+                                        static_cast<void*>(bias_ptr),
+                                        (size_t)idim.d[channel_axis]};
+    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+
+    auto layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *input_tensor,
+                                      nvinfer1::ScaleMode::kCHANNEL,
+                                      bias_weights.get(), scale_weights.get(),
+                                      power_weights.get(), channel_axis);
+
+    RreplenishLayerAndOutput(layer, "affine_channel", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(affine_channel, AffineChannelOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
index 26cd7b22d2..a6484a1355 100644
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -158,17 +158,49 @@ class BatchNormOpConverter : public OpConverter {
     TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                          0};
 
-    nvinfer1::IScaleLayer* layer =
-        TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast<nvinfer1::ITensor*>(X),
-                             nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(),
-                             scale_weights.get(), power_weights.get());
+    int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0;
+    nvinfer1::ILayer* layer = nullptr;
+    nvinfer1::IShuffleLayer* expand_layer = nullptr;
+    nvinfer1::IShuffleLayer* squeeze_layer = nullptr;
+
+    auto x_dim = X->getDimensions();
+    if (x_dim.nbDims < 3 + dynamic_shape_offset) {
+      nvinfer1::Dims expand_shape;
+      expand_shape.nbDims = 3 + dynamic_shape_offset;
+      for (int i = 0; i < 3 + dynamic_shape_offset; i++) {
+        if (i < x_dim.nbDims) {
+          expand_shape.d[i] = x_dim.d[i] < 0 ? 0 : x_dim.d[i];
+        } else {
+          expand_shape.d[i] = 1;
+        }
+      }
+      expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
+      expand_layer->setReshapeDimensions(expand_shape);
+      X = expand_layer->getOutput(0);
+    }
+
+    layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Scale, *X, nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(),
+        scale_weights.get(), power_weights.get());
 
     auto output_name = op_desc.Output("Y").front();
     engine_->SetWeights(op_desc.Input("Bias").front(),
                         std::move(combile_bias_tensor));
     engine_->SetWeights(op_desc.Input("Scale").front(),
                         std::move(combile_scale_tensor));
-    RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode);
+    if (x_dim.nbDims < 3 + dynamic_shape_offset) {
+      nvinfer1::Dims squeeze_shape;
+      squeeze_shape.nbDims = x_dim.nbDims;
+      for (int i = 0; i < squeeze_shape.nbDims; i++) {
+        squeeze_shape.d[i] = x_dim.d[i] < 0 ? 0 : x_dim.d[i];
+      }
+      squeeze_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
+      squeeze_layer->setReshapeDimensions(squeeze_shape);
+      layer = static_cast<nvinfer1::ILayer*>(squeeze_layer);
+    }
+    RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name},
+                             test_mode);
   }
 };
 
diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc
new file mode 100644
index 0000000000..b0d67a5bf9
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class MultiClassNMSOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a fluid multiclassNMS op to tensorrt plugin";
+
+    // for now, only work for static shape and regular tensor
+    framework::OpDesc op_desc(op, nullptr);
+
+    std::string bboxes = op_desc.Input("BBoxes").front();
+    std::string scores = op_desc.Input("Scores").front();
+    std::string output_name = op_desc.Output("Out").front();
+
+    auto* bboxes_tensor = engine_->GetITensor(bboxes);
+    auto* scores_tensor = engine_->GetITensor(scores);
+
+    int background_label =
+        BOOST_GET_CONST(int, op_desc.GetAttr("background_label"));
+    float score_threshold =
+        BOOST_GET_CONST(float, op_desc.GetAttr("score_threshold"));
+    int nms_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("nms_top_k"));
+    float nms_threshold =
+        BOOST_GET_CONST(float, op_desc.GetAttr("nms_threshold"));
+    int keep_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("keep_top_k"));
+    bool normalized = BOOST_GET_CONST(bool, op_desc.GetAttr("normalized"));
+    int num_classes = scores_tensor->getDimensions().d[0];
+
+    auto bboxes_dims = bboxes_tensor->getDimensions();
+    nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]);
+    auto* bboxes_expand_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor);
+    bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims);
+
+    nvinfer1::Permutation permutation{1, 0};
+    auto* scores_transpose_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor);
+    scores_transpose_layer->setFirstTranspose(permutation);
+
+    std::vector<nvinfer1::ITensor*> batch_nms_inputs;
+    batch_nms_inputs.push_back(bboxes_expand_layer->getOutput(0));
+    batch_nms_inputs.push_back(scores_transpose_layer->getOutput(0));
+
+    constexpr bool shareLocation = true;
+    constexpr bool clip_boxes = false;
+
+    const std::vector<nvinfer1::PluginField> fields{
+        {"shareLocation", &shareLocation, nvinfer1::PluginFieldType::kINT32, 1},
+        {"backgroundLabelId", &background_label,
+         nvinfer1::PluginFieldType::kINT32, 1},
+        {"numClasses", &num_classes, nvinfer1::PluginFieldType::kINT32, 1},
+        {"topK", &nms_top_k, nvinfer1::PluginFieldType::kINT32, 1},
+        {"keepTopK", &keep_top_k, nvinfer1::PluginFieldType::kINT32, 1},
+        {"scoreThreshold", &score_threshold,
+         nvinfer1::PluginFieldType::kFLOAT32, 1},
+        {"iouThreshold", &nms_threshold, nvinfer1::PluginFieldType::kFLOAT32,
+         1},
+        {"isNormalized", &normalized, nvinfer1::PluginFieldType::kINT32, 1},
+        {"clipBoxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1},
+    };
+
+    nvinfer1::PluginFieldCollection* plugin_collections =
+        static_cast<nvinfer1::PluginFieldCollection*>(
+            malloc(sizeof(*plugin_collections) +
+                   fields.size() * sizeof(nvinfer1::PluginField)));
+    plugin_collections->nbFields = static_cast<int>(fields.size());
+    plugin_collections->fields = fields.data();
+
+    auto creator = GetPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1");
+    auto batch_nms_plugin =
+        creator->createPlugin("BatchNMSPlugin", plugin_collections);
+    free(plugin_collections);
+
+    auto batch_nms_layer = engine_->network()->addPluginV2(
+        batch_nms_inputs.data(), batch_nms_inputs.size(), *batch_nms_plugin);
+    auto nmsed_boxes = batch_nms_layer->getOutput(1);
+    auto nmsed_scores = batch_nms_layer->getOutput(2);
+    auto nmsed_classes = batch_nms_layer->getOutput(3);
+
+    auto nmsed_scores_transpose_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_scores);
+    nmsed_scores_transpose_layer->setReshapeDimensions(
+        nvinfer1::Dims2(keep_top_k, 1));
+    auto nmsed_classes_reshape_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_classes);
+    nmsed_classes_reshape_layer->setReshapeDimensions(
+        nvinfer1::Dims2(keep_top_k, 1));
+
+    std::vector<nvinfer1::ITensor*> concat_inputs;
+    concat_inputs.push_back(nmsed_classes_reshape_layer->getOutput(0));
+    concat_inputs.push_back(nmsed_scores_transpose_layer->getOutput(0));
+    concat_inputs.push_back(nmsed_boxes);
+
+    auto nms_concat_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Concatenation, concat_inputs.data(), concat_inputs.size());
+    nms_concat_layer->setAxis(1);
+
+    RreplenishLayerAndOutput(nms_concat_layer, "multiclass_nms", {output_name},
+                             test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(multiclass_nms, MultiClassNMSOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc
new file mode 100644
index 0000000000..1329608aec
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Roi Align Op
+ */
+class RoiAlignOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a fluid roi align op to tensorrt plugin";
+
+    framework::OpDesc op_desc(op, nullptr);
+    std::string input_name = op_desc.Input("X").front();
+    std::string rois_name = op_desc.Input("ROIs").front();
+    std::string output_name = op_desc.Output("Out").front();
+
+    const auto pooled_height =
+        BOOST_GET_CONST(int, op_desc.GetAttr("pooled_height"));
+    const auto pooled_width =
+        BOOST_GET_CONST(int, op_desc.GetAttr("pooled_width"));
+    const auto spatial_scale =
+        BOOST_GET_CONST(float, op_desc.GetAttr("spatial_scale"));
+    const auto sampling_ratio =
+        BOOST_GET_CONST(int, op_desc.GetAttr("sampling_ratio"));
+
+    const auto input_tensor = engine_->GetITensor(input_name);
+    const auto rois_tensor = engine_->GetITensor(rois_name);
+
+    const nvinfer1::DataType data_type_ = engine_->WithFp16()
+                                              ? nvinfer1::DataType::kHALF
+                                              : nvinfer1::DataType::kFLOAT;
+
+    std::vector<nvinfer1::ITensor*> inputs{input_tensor, rois_tensor};
+    nvinfer1::ILayer* layer = nullptr;
+
+    PADDLE_ENFORCE_EQ(
+        engine_->with_dynamic_shape(), true,
+        platform::errors::InvalidArgument(
+            "TRT roi align plugin only accept the dynamic shape, because that "
+            "the roi_align will change the batch size."));
+
+    auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic(
+        data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio);
+    auto roi_align_layer = engine_->network()->addPluginV2(
+        inputs.data(), inputs.size(), *roi_align_plugin);
+    layer = roi_align_layer;
+
+    std::vector<std::string> output_names{output_name};
+    RreplenishLayerAndOutput(layer, "roi_align", output_names, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(roi_align, RoiAlignOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
new file mode 100644
index 0000000000..2d12eaf736
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
@@ -0,0 +1,79 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class YoloBoxOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a fluid yolo box op to tensorrt plugin";
+
+    framework::OpDesc op_desc(op, nullptr);
+    std::string X = op_desc.Input("X").front();
+    std::string img_size = op_desc.Input("ImgSize").front();
+
+    auto* X_tensor = engine_->GetITensor(X);
+    auto* img_size_tensor = engine_->GetITensor(img_size);
+
+    int class_num = BOOST_GET_CONST(int, op_desc.GetAttr("class_num"));
+    std::vector<int> anchors =
+        BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("anchors"));
+
+    int downsample_ratio =
+        BOOST_GET_CONST(int, op_desc.GetAttr("downsample_ratio"));
+    float conf_thresh = BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh"));
+    bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox"));
+    float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y"));
+
+    int type_id = static_cast<int>(engine_->WithFp16());
+    auto input_dim = X_tensor->getDimensions();
+    auto* yolo_box_plugin = new plugin::YoloBoxPlugin(
+        type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT,
+        anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y,
+        input_dim.d[1], input_dim.d[2]);
+
+    std::vector<nvinfer1::ITensor*> yolo_box_inputs;
+    yolo_box_inputs.push_back(X_tensor);
+    yolo_box_inputs.push_back(img_size_tensor);
+
+    auto* yolo_box_layer = engine_->network()->addPluginV2(
+        yolo_box_inputs.data(), yolo_box_inputs.size(), *yolo_box_plugin);
+
+    std::vector<std::string> output_names;
+    output_names.push_back(op_desc.Output("Boxes").front());
+    output_names.push_back(op_desc.Output("Scores").front());
+
+    RreplenishLayerAndOutput(yolo_box_layer, "yolo_box", output_names,
+                             test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(yolo_box, YoloBoxOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 11752d71a4..b66025f7a5 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -111,7 +111,14 @@ struct SimpleOpTypeSetTeller : public Teller {
       "flatten2",
       "flatten",
       "gather",
+<<<<<<< HEAD
 
+=======
+      "yolo_box",
+      "roi_align",
+      "affine_channel",
+      "multiclass_nms",
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
       "nearest_interp",
   };
 };
@@ -195,6 +202,57 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
       // current not support axis from input, use default 0
       if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false;
     }
+<<<<<<< HEAD
+=======
+
+    if (op_type == "yolo_box") {
+      if (with_dynamic_shape) return false;
+      bool has_attrs =
+          (desc.HasAttr("class_num") && desc.HasAttr("anchors") &&
+           desc.HasAttr("downsample_ratio") && desc.HasAttr("conf_thresh") &&
+           desc.HasAttr("clip_bbox") && desc.HasAttr("scale_x_y"));
+      return has_attrs;
+    }
+
+    if (op_type == "affine_channel") {
+      if (!desc.HasAttr("data_layout")) return false;
+      auto data_layout = framework::StringToDataLayout(
+          BOOST_GET_CONST(std::string, desc.GetAttr("data_layout")));
+      if (data_layout != framework::DataLayout::kNCHW) return false;
+    }
+
+    if (op_type == "multiclass_nms") {
+      if (with_dynamic_shape) return false;
+      auto* block = desc.Block();
+      for (auto& param_name : desc.Inputs()) {
+        for (auto& var_name : param_name.second) {
+          auto* var_desc = block->FindVar(var_name);
+          const auto shape = var_desc->GetShape();
+          if (shape.size() != 3) {
+            VLOG(1) << "multiclass_nms op dims != 3 not supported in tensorrt, "
+                       "but got dims "
+                    << shape.size() << ", so jump it.";
+            return false;
+          }
+        }
+      }
+      bool has_attrs =
+          (desc.HasAttr("background_label") &&
+           desc.HasAttr("score_threshold") && desc.HasAttr("nms_top_k") &&
+           desc.HasAttr("keep_top_k") && desc.HasAttr("normalized"));
+      if (has_attrs == false) return false;
+
+      auto nms_top_k = BOOST_GET_CONST(int, desc.GetAttr("nms_top_k"));
+      if (nms_top_k < 0) return false;
+
+      auto keep_top_k = BOOST_GET_CONST(int, desc.GetAttr("keep_top_k"));
+      if (keep_top_k < 0) return false;
+
+      auto registry = GetPluginRegistry();
+      if (registry == nullptr) return false;
+    }
+
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
     if (op_type == "fc" || op_type == "mul") {
       const int x_num_col_dims =
           desc.HasAttr("x_num_col_dims")
@@ -206,6 +264,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
         return false;
       }
     }
+<<<<<<< HEAD
+=======
+
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
     if (op_type == "nearest_interp") {
       std::vector<std::string> attrs{"data_layout",   "interp_method",
                                      "align_corners", "scale",
@@ -223,6 +285,31 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
       if (interp_method != "nearest") return false;
     }
 
+<<<<<<< HEAD
+=======
+    if (op_type == "roi_align") {
+      if (!with_dynamic_shape) return false;
+
+      std::vector<std::string> attrs{"pooled_height", "pooled_width",
+                                     "spatial_scale", "sampling_ratio"};
+      for (auto const attr : attrs) {
+        if (!desc.HasAttr(attr)) return false;
+      }
+
+      const auto pooled_height =
+          BOOST_GET_CONST(int, desc.GetAttr("pooled_height"));
+      if (pooled_height <= 0) return false;
+
+      const auto pooled_width =
+          BOOST_GET_CONST(int, desc.GetAttr("pooled_width"));
+      if (pooled_width <= 0) return false;
+
+      const auto spatial_scale =
+          BOOST_GET_CONST(float, desc.GetAttr("spatial_scale"));
+      if (spatial_scale <= 0.f) return false;
+    }
+
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
     if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
   }
   return false;
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 7ee16a598d..b4e948edd8 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -5,6 +5,8 @@ nv_library(tensorrt_plugin
            instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu
            qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu
            hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu
+           yolo_box_op_plugin.cu
+           roi_align_op_plugin.cu
            DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
 
 nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu
new file mode 100644
index 0000000000..42c0df41a1
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu
@@ -0,0 +1,380 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+
+#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+template <class T>
+__inline__ __device__ T BilinearInterpolate(const T* input_data,
+                                            const int height, const int width,
+                                            T y, T x) {
+  if (y < -1.f || y > height || x < -1.f || x > width) return 0;
+  y = y <= 0.f ? 0.f : y;
+  x = x <= 0.f ? 0.f : x;
+  int y_low = static_cast<int>(y);
+  int x_low = static_cast<int>(x);
+  int y_high;
+  int x_high;
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = static_cast<T>(y_low);
+  } else {
+    y_high = y_low + 1;
+  }
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = static_cast<T>(x_low);
+  } else {
+    x_high = x_low + 1;
+  }
+  T ly = y - y_low, lx = x - x_low;
+  T hy = 1.f - ly, hx = 1.f - lx;
+  T v1 = input_data[y_low * width + x_low];
+  T v2 = input_data[y_low * width + x_high];
+  T v3 = input_data[y_high * width + x_low];
+  T v4 = input_data[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <typename T, typename OutT, bool USE_SMEM>
+__global__ void GPUROIAlignOpt(const int nthreads,
+                               const T* __restrict__ input_data,
+                               const T* __restrict__ input_rois,
+                               const float spatial_scale, const int channels,
+                               const int height, const int width,
+                               const int pooled_height, const int pooled_width,
+                               const int sampling_ratio, const int num_rois,
+                               OutT* __restrict__ output_data) {
+  const int batch = blockIdx.x;
+  const int channel = blockIdx.y;
+  const T* offset_input_data =
+      input_data + (batch * channels + channel) * height * width;
+  extern __shared__ T s_input_data[];
+  if (USE_SMEM) {
+    for (int idx = threadIdx.x; idx < height * width; idx += blockDim.x) {
+      s_input_data[idx] = offset_input_data[idx];
+    }
+    __syncthreads();
+  }
+  for (int idx = threadIdx.x; idx < num_rois * pooled_height * pooled_width;
+       idx += blockDim.x) {
+    const int pw = idx % pooled_width;
+    const int ph = (idx / pooled_width) % pooled_height;
+    const int roi_idx = (idx / pooled_width / pooled_height) % num_rois;
+    const int n = batch * num_rois + roi_idx;
+    const float4 rois_offset = reinterpret_cast<const float4*>(input_rois)[n];
+    const T roi_xmin = rois_offset.x * spatial_scale;
+    const T roi_ymin = rois_offset.y * spatial_scale;
+    const T roi_xmax = rois_offset.z * spatial_scale;
+    const T roi_ymax = rois_offset.w * spatial_scale;
+    const T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.f));
+    const T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.f));
+    const T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    const T bin_size_w = roi_width / static_cast<T>(pooled_width);
+    const int roi_bin_grid_h = (sampling_ratio > 0)
+                                   ? sampling_ratio
+                                   : ceil(roi_height / pooled_height);
+    const int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+    const T count = roi_bin_grid_h * roi_bin_grid_w;
+
+    T output_val = 0.f;
+    for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
+      const T y = roi_ymin + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+      for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
+        const T x = roi_xmin + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+        if (USE_SMEM) {
+          T val = BilinearInterpolate<T>(s_input_data, height, width, y, x);
+          output_val += val;
+        } else {
+          T val =
+              BilinearInterpolate<T>(offset_input_data, height, width, y, x);
+          output_val += val;
+        }
+      }
+    }
+    output_val /= count;
+    const int out_offset =
+        batch * num_rois * channels * pooled_height * pooled_width +
+        roi_idx * channels * pooled_height * pooled_width +
+        channel * pooled_height * pooled_width + ph * pooled_width + pw;
+    output_data[out_offset] = static_cast<OutT>(output_val);
+  }
+}
+
+#if IS_TRT_VERSION_GE(6000)
+RoiAlignPluginDynamic::RoiAlignPluginDynamic(const nvinfer1::DataType data_type,
+                                             const int pooled_height,
+                                             const int pooled_width,
+                                             float spatial_scale,
+                                             int sampling_ratio)
+    : data_type_(data_type),
+      pooled_height_(pooled_height),
+      pooled_width_(pooled_width),
+      spatial_scale_(spatial_scale),
+      sampling_ratio_(sampling_ratio) {
+  bool data_type_is_valid = data_type_ == nvinfer1::DataType::kFLOAT ||
+                            data_type_ == nvinfer1::DataType::kHALF;
+  PADDLE_ENFORCE_EQ(data_type_is_valid, true,
+                    platform::errors::InvalidArgument(
+                        "TRT RoiAlign plugin only accepts kFLOAT(%d) or "
+                        "kHALF(%d) data type, but the received data type = %d",
+                        static_cast<int>(nvinfer1::DataType::kFLOAT),
+                        static_cast<int>(nvinfer1::DataType::kHALF),
+                        static_cast<int>(data_type_)));
+
+  PADDLE_ENFORCE_GT(pooled_height_, 0,
+                    platform::errors::InvalidArgument(
+                        "TRT RoiAlign plugin only accepts pooled_height "
+                        "greater than %d, but the received pooled_height = %d",
+                        0, pooled_height_));
+
+  PADDLE_ENFORCE_GT(pooled_width_, 0,
+                    platform::errors::InvalidArgument(
+                        "TRT RoiAlign plugin only accepts pooled_width greater "
+                        "than %d, but the received pooled_width = %d",
+                        0, pooled_height_));
+
+  PADDLE_ENFORCE_GT(spatial_scale_, 0.f,
+                    platform::errors::InvalidArgument(
+                        "TRT RoiAlign plugin only accepts spatial_scale "
+                        "greater than %f, but the received spatial_scale = %f",
+                        0, spatial_scale_));
+
+  int smem_per_block = -1;
+  int device = -1;
+  cudaGetDevice(&device);
+
+  PADDLE_ENFORCE_GE(
+      device, 0,
+      platform::errors::InvalidArgument(
+          "The cuda device ID should be greater than %d, but device ID is %d",
+          0, device));
+
+  cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock,
+                         device);
+  smem_per_block_ = smem_per_block;
+}
+
+RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) {
+  DeserializeValue(&data, &length, &data_type_);
+  DeserializeValue(&data, &length, &pooled_height_);
+  DeserializeValue(&data, &length, &pooled_width_);
+  DeserializeValue(&data, &length, &spatial_scale_);
+  DeserializeValue(&data, &length, &sampling_ratio_);
+  int smem_per_block = -1;
+  int device = -1;
+  cudaGetDevice(&device);
+  PADDLE_ENFORCE_GE(
+      device, 0,
+      platform::errors::InvalidArgument(
+          "The cuda device ID should be greater than %d, but device ID is %d",
+          0, device));
+  cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock,
+                         device);
+  smem_per_block_ = smem_per_block;
+}
+
+nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const {
+  auto* plugin =
+      new RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_,
+                                spatial_scale_, sampling_ratio_);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+
+nvinfer1::DimsExprs RoiAlignPluginDynamic::getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+    nvinfer1::IExprBuilder& exprBuilder) {
+  nvinfer1::DimsExprs ret{};
+  ret.nbDims = 4;
+  ret.d[0] = inputs[1].d[0];  // roi
+  ret.d[1] = inputs[0].d[1];  // X
+  ret.d[2] = exprBuilder.constant(pooled_height_);
+  ret.d[3] = exprBuilder.constant(pooled_width_);
+  return ret;
+}
+
+bool RoiAlignPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
+    int nbOutputs) {
+  if (inOut[pos].format != nvinfer1::TensorFormat::kLINEAR) {
+    return false;
+  }
+  if (pos < 2) {  // input
+    return inOut[pos].type == nvinfer1::DataType::kFLOAT;
+  }
+  return inOut[pos].type == data_type_;
+}
+
+void RoiAlignPluginDynamic::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {}
+
+size_t RoiAlignPluginDynamic::getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
+    const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
+  return 0;
+}
+
+template <typename T, typename OutT>
+int RoiAlignPluginDynamic::enqueue_impl(
+    const nvinfer1::PluginTensorDesc* inputDesc,
+    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  auto in_dims = inputDesc[0].dims;
+  auto rois_dims = inputDesc[1].dims;
+  auto out_dims = outputDesc[0].dims;
+
+  int rois_num = rois_dims.d[0];
+  if (rois_num == 0) return cudaGetLastError() != cudaSuccess;
+
+  int batch = in_dims.d[0];
+  int channels = in_dims.d[1];
+  int height = in_dims.d[2];
+  int width = in_dims.d[3];
+
+  int output_size =
+      out_dims.d[0] * out_dims.d[1] * out_dims.d[2] * out_dims.d[3];
+
+  const dim3 blocks(batch, channels);
+  const int threads = 512;
+
+  if (smem_per_block_ < width * height * sizeof(T)) {
+    GPUROIAlignOpt<T, OutT, false><<<blocks, threads, 0, stream>>>(
+        output_size, static_cast<const T*>(inputs[0]),
+        static_cast<const T*>(inputs[1]), spatial_scale_, channels, height,
+        width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch,
+        static_cast<OutT*>(outputs[0]));
+  } else {
+    GPUROIAlignOpt<
+        T, OutT, true><<<blocks, threads, width * height * sizeof(T), stream>>>(
+        output_size, static_cast<const T*>(inputs[0]),
+        static_cast<const T*>(inputs[1]), spatial_scale_, channels, height,
+        width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch,
+        static_cast<OutT*>(outputs[0]));
+  }
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+int RoiAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                                   const nvinfer1::PluginTensorDesc* outputDesc,
+                                   const void* const* inputs,
+                                   void* const* outputs, void* workspace,
+                                   cudaStream_t stream) {
+  PADDLE_ENFORCE_EQ(outputDesc[0].type, data_type_,
+                    platform::errors::InvalidArgument(
+                        "TRT RoiAlignPluginDynamic expects outputDesc[0].type "
+                        "equal to data_type_"));
+
+  if (data_type_ == nvinfer1::DataType::kHALF) {
+    return enqueue_impl<float, half>(inputDesc, outputDesc, inputs, outputs,
+                                     workspace, stream);
+  }
+  return enqueue_impl<float, float>(inputDesc, outputDesc, inputs, outputs,
+                                    workspace, stream);
+}
+
+nvinfer1::DataType RoiAlignPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType* inputTypes, int nbInputs) const {
+  return data_type_;
+}
+
+const char* RoiAlignPluginDynamic::getPluginType() const {
+  return "roi_align_plugin_dynamic";
+}
+
+int RoiAlignPluginDynamic::getNbOutputs() const { return 1; }
+
+int RoiAlignPluginDynamic::initialize() { return 0; }
+
+void RoiAlignPluginDynamic::terminate() {}
+
+size_t RoiAlignPluginDynamic::getSerializationSize() const {
+  size_t serialize_size = 0;
+  serialize_size += SerializedSize(data_type_);
+  serialize_size += SerializedSize(pooled_height_);
+  serialize_size += SerializedSize(pooled_width_);
+  serialize_size += SerializedSize(spatial_scale_);
+  serialize_size += SerializedSize(sampling_ratio_);
+  return serialize_size;
+}
+
+void RoiAlignPluginDynamic::serialize(void* buffer) const {
+  SerializeValue(&buffer, data_type_);
+  SerializeValue(&buffer, pooled_height_);
+  SerializeValue(&buffer, pooled_width_);
+  SerializeValue(&buffer, spatial_scale_);
+  SerializeValue(&buffer, sampling_ratio_);
+}
+
+void RoiAlignPluginDynamic::destroy() {}
+
+RoiAlignPluginDynamicCreator::RoiAlignPluginDynamicCreator() {}
+
+void RoiAlignPluginDynamicCreator::setPluginNamespace(
+    const char* lib_namespace) {
+  namespace_ = std::string(lib_namespace);
+}
+
+const char* RoiAlignPluginDynamicCreator::getPluginNamespace() const {
+  return namespace_.c_str();
+}
+
+const char* RoiAlignPluginDynamicCreator::getPluginName() const {
+  return "roi_align_plugin_dynamic";
+}
+
+const char* RoiAlignPluginDynamicCreator::getPluginVersion() const {
+  return "1";
+}
+
+const nvinfer1::PluginFieldCollection*
+RoiAlignPluginDynamicCreator::getFieldNames() {
+  return &field_collection_;
+}
+
+nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin(
+    const char* name, const nvinfer1::PluginFieldCollection* fc) {
+  const nvinfer1::PluginField* fields = fc->fields;
+}
+
+nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin(
+    const char* name, const void* serial_data, size_t serial_length) {
+  auto plugin = new RoiAlignPluginDynamic(serial_data, serial_length);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h
new file mode 100644
index 0000000000..bba7d0d5a9
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#if IS_TRT_VERSION_GE(6000)
+class RoiAlignPluginDynamic : public DynamicPluginTensorRT {
+ public:
+  explicit RoiAlignPluginDynamic(const nvinfer1::DataType data_type,
+                                 const int pooled_height,
+                                 const int pooled_width, float spatial_scale,
+                                 int sampling_ratio);
+  RoiAlignPluginDynamic(void const* data, size_t length);
+  ~RoiAlignPluginDynamic() = default;
+  nvinfer1::IPluginV2DynamicExt* clone() const override;
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) override;
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc* inOut,
+                                 int nbInputs, int nbOutputs) override;
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc* out,
+                       int nbOutputs) override;
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc* outputs,
+                          int nbOutputs) const override;
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream) override;
+
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType* inputTypes,
+                                       int nbInputs) const override;
+
+  const char* getPluginType() const override;
+  int getNbOutputs() const override;
+  int initialize() override;
+  void terminate() override;
+  size_t getSerializationSize() const override;
+  void serialize(void* buffer) const override;
+  void destroy() override;
+
+ private:
+  template <typename T, typename OutT>
+  int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc,
+                   const nvinfer1::PluginTensorDesc* outputDesc,
+                   const void* const* inputs, void* const* outputs,
+                   void* workspace, cudaStream_t stream);
+
+  nvinfer1::DataType data_type_;
+  int pooled_height_;
+  int pooled_width_;
+  float spatial_scale_;
+  int sampling_ratio_;
+  int smem_per_block_;
+  std::string namespace_;
+};
+
+class RoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator {
+ public:
+  RoiAlignPluginDynamicCreator();
+  ~RoiAlignPluginDynamicCreator() override = default;
+
+  void setPluginNamespace(const char* lib_namespace) override;
+  const char* getPluginNamespace() const override;
+  const char* getPluginName() const override;
+  const char* getPluginVersion() const override;
+  const nvinfer1::PluginFieldCollection* getFieldNames() override;
+
+  nvinfer1::IPluginV2Ext* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override;
+  nvinfer1::IPluginV2Ext* deserializePlugin(const char* name,
+                                            const void* serial_data,
+                                            size_t serial_length) override;
+
+ private:
+  std::string namespace_;
+  nvinfer1::PluginFieldCollection field_collection_;
+};
+REGISTER_TRT_PLUGIN_V2(RoiAlignPluginDynamicCreator);
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu
new file mode 100644
index 0000000000..e1b4c898d2
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu
@@ -0,0 +1,404 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <cassert>
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h"
+#include "paddle/fluid/operators/detection/yolo_box_op.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type,
+                             const std::vector<int>& anchors,
+                             const int class_num, const float conf_thresh,
+                             const int downsample_ratio, const bool clip_bbox,
+                             const float scale_x_y, const int input_h,
+                             const int input_w)
+    : data_type_(data_type),
+      class_num_(class_num),
+      conf_thresh_(conf_thresh),
+      downsample_ratio_(downsample_ratio),
+      clip_bbox_(clip_bbox),
+      scale_x_y_(scale_x_y),
+      input_h_(input_h),
+      input_w_(input_w) {
+  anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend());
+  assert(data_type_ == nvinfer1::DataType::kFLOAT ||
+         data_type_ == nvinfer1::DataType::kHALF);
+  assert(class_num_ > 0);
+  assert(input_h_ > 0);
+  assert(input_w_ > 0);
+
+  cudaMalloc(&anchors_device_, anchors.size() * sizeof(int));
+  cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int),
+             cudaMemcpyHostToDevice);
+}
+
+YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) {
+  DeserializeValue(&data, &length, &data_type_);
+  DeserializeValue(&data, &length, &anchors_);
+  DeserializeValue(&data, &length, &class_num_);
+  DeserializeValue(&data, &length, &conf_thresh_);
+  DeserializeValue(&data, &length, &downsample_ratio_);
+  DeserializeValue(&data, &length, &clip_bbox_);
+  DeserializeValue(&data, &length, &scale_x_y_);
+  DeserializeValue(&data, &length, &input_h_);
+  DeserializeValue(&data, &length, &input_w_);
+}
+
+YoloBoxPlugin::~YoloBoxPlugin() {
+  if (anchors_device_ != nullptr) {
+    cudaFree(anchors_device_);
+    anchors_device_ = nullptr;
+  }
+}
+
+const char* YoloBoxPlugin::getPluginType() const { return "yolo_box_plugin"; }
+
+const char* YoloBoxPlugin::getPluginVersion() const { return "1"; }
+
+int YoloBoxPlugin::getNbOutputs() const { return 2; }
+
+nvinfer1::Dims YoloBoxPlugin::getOutputDimensions(int index,
+                                                  const nvinfer1::Dims* inputs,
+                                                  int nb_input_dims) {
+  const int anchor_num = anchors_.size() / 2;
+  const int box_num = inputs[0].d[1] * inputs[0].d[2] * anchor_num;
+
+  assert(index <= 1);
+
+  if (index == 0) {
+    return nvinfer1::Dims2(box_num, 4);
+  }
+  return nvinfer1::Dims2(box_num, class_num_);
+}
+
+bool YoloBoxPlugin::supportsFormat(nvinfer1::DataType type,
+                                   nvinfer1::TensorFormat format) const {
+  return ((type == data_type_ || type == nvinfer1::DataType::kINT32) &&
+          format == nvinfer1::TensorFormat::kLINEAR);
+}
+
+size_t YoloBoxPlugin::getWorkspaceSize(int max_batch_size) const { return 0; }
+
+template <typename T>
+__device__ inline T sigmoid(T x) {
+  return 1. / (1. + exp(-x));
+}
+
+template <>
+__device__ inline float sigmoid(float x) {
+  return 1.f / (1.f + expf(-x));
+}
+
+template <typename T>
+__device__ inline void GetYoloBox(float* box, const T* x, const int* anchors,
+                                  int i, int j, int an_idx, int grid_size_h,
+                                  int grid_size_w, int input_size_h,
+                                  int input_size_w, int index, int stride,
+                                  int img_height, int img_width, float scale,
+                                  float bias) {
+  box[0] = static_cast<float>(
+      (i + sigmoid(static_cast<float>(x[index]) * scale + bias)) * img_width /
+      grid_size_w);
+  box[1] = static_cast<float>(
+      (j + sigmoid(static_cast<float>(x[index + stride]) * scale + bias)) *
+      img_height / grid_size_h);
+  box[2] = static_cast<float>(expf(static_cast<float>(x[index + 2 * stride])) *
+                              anchors[2 * an_idx] * img_width / input_size_w);
+  box[3] =
+      static_cast<float>(expf(static_cast<float>(x[index + 3 * stride])) *
+                         anchors[2 * an_idx + 1] * img_height / input_size_h);
+}
+
+__device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
+                                    int an_num, int an_stride, int stride,
+                                    int entry) {
+  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
+}
+
+template <typename T>
+__device__ inline void CalcDetectionBox(T* boxes, const float* box,
+                                        const int box_idx, const int img_height,
+                                        const int img_width, bool clip_bbox) {
+  float tmp_box_0, tmp_box_1, tmp_box_2, tmp_box_3;
+  tmp_box_0 = box[0] - box[2] / 2;
+  tmp_box_1 = box[1] - box[3] / 2;
+  tmp_box_2 = box[0] + box[2] / 2;
+  tmp_box_3 = box[1] + box[3] / 2;
+
+  if (clip_bbox) {
+    tmp_box_0 = max(tmp_box_0, 0.f);
+    tmp_box_1 = max(tmp_box_1, 0.f);
+    tmp_box_2 = min(tmp_box_2, static_cast<float>(img_width - 1));
+    tmp_box_3 = min(tmp_box_3, static_cast<float>(img_height - 1));
+  }
+
+  boxes[box_idx + 0] = static_cast<T>(tmp_box_0);
+  boxes[box_idx + 1] = static_cast<T>(tmp_box_1);
+  boxes[box_idx + 2] = static_cast<T>(tmp_box_2);
+  boxes[box_idx + 3] = static_cast<T>(tmp_box_3);
+}
+
+template <typename T>
+__device__ inline void CalcLabelScore(T* scores, const T* input,
+                                      const int label_idx, const int score_idx,
+                                      const int class_num, const float conf,
+                                      const int stride) {
+  for (int i = 0; i < class_num; i++) {
+    scores[score_idx + i] = static_cast<T>(
+        conf * sigmoid(static_cast<float>(input[label_idx + i * stride])));
+  }
+}
+
+template <typename T>
+__global__ void KeYoloBoxFw(const T* const input, const int* const imgsize,
+                            T* boxes, T* scores, const float conf_thresh,
+                            const int* anchors, const int n, const int h,
+                            const int w, const int an_num, const int class_num,
+                            const int box_num, int input_size_h,
+                            int input_size_w, bool clip_bbox, const float scale,
+                            const float bias) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  float box[4];
+  for (; tid < n * box_num; tid += stride) {
+    int grid_num = h * w;
+    int i = tid / box_num;
+    int j = (tid % box_num) / grid_num;
+    int k = (tid % grid_num) / w;
+    int l = tid % w;
+
+    int an_stride = (5 + class_num) * grid_num;
+    int img_height = imgsize[2 * i];
+    int img_width = imgsize[2 * i + 1];
+
+    int obj_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4);
+    float conf = sigmoid(static_cast<float>(input[obj_idx]));
+    int box_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0);
+
+    if (conf < conf_thresh) {
+      for (int i = 0; i < 4; ++i) {
+        box[i] = 0.f;
+      }
+    } else {
+      GetYoloBox<T>(box, input, anchors, l, k, j, h, w, input_size_h,
+                    input_size_w, box_idx, grid_num, img_height, img_width,
+                    scale, bias);
+    }
+
+    box_idx = (i * box_num + j * grid_num + k * w + l) * 4;
+    CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width, clip_bbox);
+
+    int label_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5);
+    int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num;
+    CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf,
+                      grid_num);
+  }
+}
+
+template <typename T>
+int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs,
+                                void** outputs, void* workspace,
+                                cudaStream_t stream) {
+  const int n = batch_size;
+  const int h = input_h_;
+  const int w = input_w_;
+  const int an_num = anchors_.size() / 2;
+  const int box_num = h * w * an_num;
+  int input_size_h = downsample_ratio_ * h;
+  int input_size_w = downsample_ratio_ * w;
+
+  float bias = -0.5 * (scale_x_y_ - 1.);
+  constexpr int threads = 256;
+
+  KeYoloBoxFw<T><<<(n * box_num + threads - 1) / threads, threads, 0, stream>>>(
+      reinterpret_cast<const T* const>(inputs[0]),
+      reinterpret_cast<const int* const>(inputs[1]),
+      reinterpret_cast<T*>(outputs[0]), reinterpret_cast<T*>(outputs[1]),
+      conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num,
+      input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+int YoloBoxPlugin::enqueue(int batch_size, const void* const* inputs,
+                           void** outputs, void* workspace,
+                           cudaStream_t stream) {
+  if (data_type_ == nvinfer1::DataType::kFLOAT) {
+    return enqueue_impl<float>(batch_size, inputs, outputs, workspace, stream);
+  } else if (data_type_ == nvinfer1::DataType::kHALF) {
+    return enqueue_impl<half>(batch_size, inputs, outputs, workspace, stream);
+  }
+  assert("unsupported type.");
+}
+
+int YoloBoxPlugin::initialize() { return 0; }
+
+void YoloBoxPlugin::terminate() {}
+
+size_t YoloBoxPlugin::getSerializationSize() const {
+  size_t serialize_size = 0;
+  serialize_size += SerializedSize(data_type_);
+  serialize_size += SerializedSize(anchors_);
+  serialize_size += SerializedSize(class_num_);
+  serialize_size += SerializedSize(conf_thresh_);
+  serialize_size += SerializedSize(downsample_ratio_);
+  serialize_size += SerializedSize(clip_bbox_);
+  serialize_size += SerializedSize(scale_x_y_);
+  serialize_size += SerializedSize(input_h_);
+  serialize_size += SerializedSize(input_w_);
+  return serialize_size;
+}
+
+void YoloBoxPlugin::serialize(void* buffer) const {
+  SerializeValue(&buffer, data_type_);
+  SerializeValue(&buffer, anchors_);
+  SerializeValue(&buffer, class_num_);
+  SerializeValue(&buffer, conf_thresh_);
+  SerializeValue(&buffer, downsample_ratio_);
+  SerializeValue(&buffer, clip_bbox_);
+  SerializeValue(&buffer, scale_x_y_);
+  SerializeValue(&buffer, input_h_);
+  SerializeValue(&buffer, input_w_);
+}
+
+void YoloBoxPlugin::destroy() {
+  cudaFree(anchors_device_);
+  delete this;
+}
+
+void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) {
+  namespace_ = std::string(lib_namespace);
+}
+
+const char* YoloBoxPlugin::getPluginNamespace() const {
+  return namespace_.c_str();
+}
+
+nvinfer1::DataType YoloBoxPlugin::getOutputDataType(
+    int index, const nvinfer1::DataType* input_type, int nb_inputs) const {
+  return data_type_;
+}
+
+bool YoloBoxPlugin::isOutputBroadcastAcrossBatch(int output_index,
+                                                 const bool* input_is_broadcast,
+                                                 int nb_inputs) const {
+  return false;
+}
+
+bool YoloBoxPlugin::canBroadcastInputAcrossBatch(int input_index) const {
+  return false;
+}
+
+void YoloBoxPlugin::configurePlugin(
+    const nvinfer1::Dims* input_dims, int nb_inputs,
+    const nvinfer1::Dims* output_dims, int nb_outputs,
+    const nvinfer1::DataType* input_types,
+    const nvinfer1::DataType* output_types, const bool* input_is_broadcast,
+    const bool* output_is_broadcast, nvinfer1::PluginFormat float_format,
+    int max_batct_size) {}
+
+nvinfer1::IPluginV2Ext* YoloBoxPlugin::clone() const {
+  return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_,
+                           downsample_ratio_, clip_bbox_, scale_x_y_, input_h_,
+                           input_w_);
+}
+
+YoloBoxPluginCreator::YoloBoxPluginCreator() {}
+
+void YoloBoxPluginCreator::setPluginNamespace(const char* lib_namespace) {
+  namespace_ = std::string(lib_namespace);
+}
+
+const char* YoloBoxPluginCreator::getPluginNamespace() const {
+  return namespace_.c_str();
+}
+
+const char* YoloBoxPluginCreator::getPluginName() const {
+  return "yolo_box_plugin";
+}
+
+const char* YoloBoxPluginCreator::getPluginVersion() const { return "1"; }
+
+const nvinfer1::PluginFieldCollection* YoloBoxPluginCreator::getFieldNames() {
+  return &field_collection_;
+}
+
+nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin(
+    const char* name, const nvinfer1::PluginFieldCollection* fc) {
+  const nvinfer1::PluginField* fields = fc->fields;
+
+  int type_id = -1;
+  std::vector<int> anchors;
+  int class_num = -1;
+  float conf_thresh = 0.01;
+  int downsample_ratio = 32;
+  bool clip_bbox = true;
+  float scale_x_y = 1.;
+  int h = -1;
+  int w = -1;
+
+  for (int i = 0; i < fc->nbFields; ++i) {
+    const std::string field_name(fc->fields[i].name);
+    if (field_name.compare("type_id") == 0) {
+      type_id = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("anchors")) {
+      const int length = fc->fields[i].length;
+      const int* data = static_cast<const int*>(fc->fields[i].data);
+      anchors.insert(anchors.end(), data, data + length);
+    } else if (field_name.compare("class_num")) {
+      class_num = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("conf_thresh")) {
+      conf_thresh = *static_cast<const float*>(fc->fields[i].data);
+    } else if (field_name.compare("downsample_ratio")) {
+      downsample_ratio = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("clip_bbox")) {
+      clip_bbox = *static_cast<const bool*>(fc->fields[i].data);
+    } else if (field_name.compare("scale_x_y")) {
+      scale_x_y = *static_cast<const float*>(fc->fields[i].data);
+    } else if (field_name.compare("h")) {
+      h = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("w")) {
+      w = *static_cast<const int*>(fc->fields[i].data);
+    } else {
+      assert(false && "unknown plugin field name.");
+    }
+  }
+
+  return new YoloBoxPlugin(
+      type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors,
+      class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w);
+}
+
+nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin(
+    const char* name, const void* serial_data, size_t serial_length) {
+  auto plugin = new YoloBoxPlugin(serial_data, serial_length);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h
new file mode 100644
index 0000000000..8ca21da7ae
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h
@@ -0,0 +1,117 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class YoloBoxPlugin : public nvinfer1::IPluginV2Ext {
+ public:
+  explicit YoloBoxPlugin(const nvinfer1::DataType data_type,
+                         const std::vector<int>& anchors, const int class_num,
+                         const float conf_thresh, const int downsample_ratio,
+                         const bool clip_bbox, const float scale_x_y,
+                         const int input_h, const int input_w);
+  YoloBoxPlugin(const void* data, size_t length);
+  ~YoloBoxPlugin() override;
+
+  const char* getPluginType() const override;
+  const char* getPluginVersion() const override;
+  int getNbOutputs() const override;
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+                                     int nb_input_dims) override;
+  bool supportsFormat(nvinfer1::DataType type,
+                      nvinfer1::TensorFormat format) const override;
+  size_t getWorkspaceSize(int max_batch_size) const override;
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
+              void* workspace, cudaStream_t stream) override;
+  template <typename T>
+  int enqueue_impl(int batch_size, const void* const* inputs, void** outputs,
+                   void* workspace, cudaStream_t stream);
+  int initialize() override;
+  void terminate() override;
+  size_t getSerializationSize() const override;
+  void serialize(void* buffer) const override;
+  void destroy() override;
+  void setPluginNamespace(const char* lib_namespace) override;
+  const char* getPluginNamespace() const override;
+
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType* input_type,
+                                       int nb_inputs) const override;
+  bool isOutputBroadcastAcrossBatch(int output_index,
+                                    const bool* input_is_broadcast,
+                                    int nb_inputs) const override;
+  bool canBroadcastInputAcrossBatch(int input_index) const override;
+  void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs,
+                       const nvinfer1::Dims* output_dims, int nb_outputs,
+                       const nvinfer1::DataType* input_types,
+                       const nvinfer1::DataType* output_types,
+                       const bool* input_is_broadcast,
+                       const bool* output_is_broadcast,
+                       nvinfer1::PluginFormat float_format,
+                       int max_batct_size) override;
+  nvinfer1::IPluginV2Ext* clone() const override;
+
+ private:
+  nvinfer1::DataType data_type_;
+  std::vector<int> anchors_;
+  int* anchors_device_;
+  int class_num_;
+  float conf_thresh_;
+  int downsample_ratio_;
+  bool clip_bbox_;
+  float scale_x_y_;
+  int input_h_;
+  int input_w_;
+  std::string namespace_;
+};
+
+class YoloBoxPluginCreator : public nvinfer1::IPluginCreator {
+ public:
+  YoloBoxPluginCreator();
+  ~YoloBoxPluginCreator() override = default;
+
+  void setPluginNamespace(const char* lib_namespace) override;
+  const char* getPluginNamespace() const override;
+  const char* getPluginName() const override;
+  const char* getPluginVersion() const override;
+  const nvinfer1::PluginFieldCollection* getFieldNames() override;
+
+  nvinfer1::IPluginV2Ext* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override;
+  nvinfer1::IPluginV2Ext* deserializePlugin(const char* name,
+                                            const void* serial_data,
+                                            size_t serial_length) override;
+
+ private:
+  std::string namespace_;
+  nvinfer1::PluginFieldCollection field_collection_;
+};
+
+REGISTER_TRT_PLUGIN_V2(YoloBoxPluginCreator);
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 92f9c20a36..75628adbe8 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -530,7 +530,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
             ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
             
     set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
-    if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz)
+    if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz)
         inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz")
     endif()
     inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc
@@ -538,7 +538,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
             ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR})
 
     set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware")
-    if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz)
+    if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz)
         inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz")
     endif()
     inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc
@@ -576,8 +576,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
             EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
             ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune)
 
-    set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/")
-    if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz)
+    if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz)
         inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
     endif()
 
@@ -585,8 +584,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
             EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
             ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
 
-    set(TEST_TRT_ERNIE_UNSER_FP16_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_fp16_unserialized/")
-    if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_FP16_MODEL}/ernie_model_4_unserialized.tgz)
+    if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz)
         inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz")
     endif()
 
diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
index 6d4bb70df6..9211ea246a 100644
--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -75,14 +75,15 @@ int test_predictor_zero_copy(const AnalysisConfig& config_in,
   }
 
   std::vector<float> input({1});
-  auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())};
+  auto in_tensor =
+      predictor->GetInputTensor(predictor->GetInputNames().front());
   in_tensor->Reshape({1, 1});
   in_tensor->copy_from_cpu(input.data());
 
   predictor->ZeroCopyRun();
 
-  auto out_tensor{
-      predictor->GetOutputTensor(predictor->GetOutputNames().front())};
+  auto out_tensor =
+      predictor->GetOutputTensor(predictor->GetOutputNames().front());
   std::vector<float> data_o(10);
   out_tensor->copy_to_cpu(data_o.data());
 
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 7f871fab5a..6f252e1bd0 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -40,7 +40,7 @@ void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
                                                   platform::CPUPlace src_place,
                                                   const void* src, size_t num) {
   if (num <= 0) {
-    VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
+    VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
     return;
   }
   int dev_id = -1;
@@ -86,7 +86,7 @@ void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
                                                   platform::XPUPlace src_place,
                                                   const void* src, size_t num) {
   if (num <= 0) {
-    VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
+    VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
     return;
   }
   int dev_id = -1;
@@ -132,7 +132,7 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
                                                   platform::XPUPlace src_place,
                                                   const void* src, size_t num) {
   if (num <= 0) {
-    VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
+    VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
     return;
   }
   int dev_id = -1;
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 29498da0f0..69cf73fd5a 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -48,12 +48,20 @@ class BaseGPUFunctor {
 
 /* ===========================    relu forward   ============================ */
 template <typename T>
+<<<<<<< HEAD
 class ReluGPUFuctor : public BaseGPUFunctor<T> {
+=======
+class ReluGPUFunctor : public BaseGPUFunctor<T> {
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
  private:
   T zero_;
 
  public:
+<<<<<<< HEAD
   ReluGPUFuctor() { zero_ = static_cast<T>(0.0f); }
+=======
+  ReluGPUFunctor() { zero_ = static_cast<T>(0.0f); }
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
   // for relu forward when T is double
   __device__ __forceinline__ typename CudaVecType<T>::type Compute(
@@ -67,9 +75,15 @@ class ReluGPUFuctor : public BaseGPUFunctor<T> {
 
 template <>
 __device__ __forceinline__ CudaVecType<double>::type
+<<<<<<< HEAD
 ReluGPUFuctor<double>::Compute(const CudaVecType<double>::type* x) {
 // relu forward : out = max(x, 0)
 #ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+=======
+ReluGPUFunctor<double>::Compute(const CudaVecType<double>::type* x) {
+// relu forward : out = max(x, 0)
+#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
   return __ldg(x) > zero_ ? __ldg(x) : zero_;
 #else
   return (*x) > zero_ ? (*x) : zero_;
@@ -78,7 +92,11 @@ ReluGPUFuctor<double>::Compute(const CudaVecType<double>::type* x) {
 
 template <>
 __device__ __forceinline__ CudaVecType<float>::type
+<<<<<<< HEAD
 ReluGPUFuctor<float>::Compute(const CudaVecType<float>::type* xx) {
+=======
+ReluGPUFunctor<float>::Compute(const CudaVecType<float>::type* xx) {
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
   // relu forward : out = max(xx, 0)
   return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y),
                      (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w));
@@ -86,9 +104,15 @@ ReluGPUFuctor<float>::Compute(const CudaVecType<float>::type* xx) {
 
 template <>
 __device__ __forceinline__ CudaVecType<float16>::type
+<<<<<<< HEAD
 ReluGPUFuctor<float16>::Compute(const CudaVecType<float16>::type* in) {
 // relu forward : out = max(in, 0)
 #ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+=======
+ReluGPUFunctor<float16>::Compute(const CudaVecType<float16>::type* in) {
+// relu forward : out = max(in, 0)
+#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
   const half2 kzero = __float2half2_rn(0.0f);
   return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in));
 #else
@@ -117,7 +141,11 @@ class ReluGradGPUFunctor : public BaseGPUFunctor<T> {
 
   // when num % vecsize != 0 this func will be used
   __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) {
+<<<<<<< HEAD
     // relu backward : dx = out > 0 ? dout : 0;
+=======
+    // relu backward : dx = out > 0 ? dout : 0
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
     return out > zero_ ? dout : zero_;
   }
 
@@ -129,7 +157,11 @@ __device__ __forceinline__ CudaVecType<double>::type
 ReluGradGPUFunctor<double>::Compute(const CudaVecType<double>::type* out,
                                     const CudaVecType<double>::type* dout) {
 // relu backward : dx = out > 0 ? dout : 0;
+<<<<<<< HEAD
 #ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+=======
+#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
   return __ldg(out) > zero_ ? __ldg(dout) : zero_;
 #else
   return (*out) > zero_ ? (*dout) : zero_;
@@ -151,7 +183,11 @@ __device__ __forceinline__ CudaVecType<float16>::type
 ReluGradGPUFunctor<float16>::Compute(const CudaVecType<float16>::type* out,
                                      const CudaVecType<float16>::type* dout) {
 // relu backward : dx = out > 0 ? dout : 0;
+<<<<<<< HEAD
 #ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+=======
+#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
   const half2 kzero = __float2half2_rn(0.0f);
   return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout));
 #else
@@ -233,8 +269,14 @@ class ActivationGPUKernel
     Functor functor;
     constexpr int vecsize = CudaVecType<T>::vecsize;
     int grid = max((num / vecsize + block - 1) / block, 1);
+<<<<<<< HEAD
     ActivationkernelVec<T, Functor><<<grid, block>>>(input_data, output_data,
                                                      num, functor);
+=======
+    auto stream = context.cuda_device_context().stream();
+    ActivationkernelVec<T, Functor><<<grid, block, 0, stream>>>(
+        input_data, output_data, num, functor);
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
   }
 };
 
@@ -272,7 +314,12 @@ class ActivationGradGPUKernel
     Functor functor;
     constexpr int vecsize = CudaVecType<T>::vecsize;
     int grid = max((numel / vecsize + block - 1) / block, 1);
+<<<<<<< HEAD
     ActivationGradKernelVec<T, Functor><<<grid, block>>>(
+=======
+    auto stream = context.cuda_device_context().stream();
+    ActivationGradKernelVec<T, Functor><<<grid, block, 0, stream>>>(
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
         forward_data, dout_data, dx_data, numel, functor);
   }
 };
@@ -330,11 +377,19 @@ REGISTER_OP_CUDA_KERNEL(
 /* ===========================    relu register  ============================ */
 REGISTER_OP_CUDA_KERNEL(
     relu, ops::ActivationGPUKernel<paddle::platform::CUDADeviceContext,
+<<<<<<< HEAD
                                    ops::ReluGPUFuctor<float>>,
     ops::ActivationGPUKernel<paddle::platform::CUDADeviceContext,
                              ops::ReluGPUFuctor<double>>,
     ops::ActivationGPUKernel<plat::CUDADeviceContext,
                              ops::ReluGPUFuctor<plat::float16>>);
+=======
+                                   ops::ReluGPUFunctor<float>>,
+    ops::ActivationGPUKernel<paddle::platform::CUDADeviceContext,
+                             ops::ReluGPUFunctor<double>>,
+    ops::ActivationGPUKernel<plat::CUDADeviceContext,
+                             ops::ReluGPUFunctor<plat::float16>>);
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
 REGISTER_OP_CUDA_KERNEL(
     relu_grad, ops::ActivationGradGPUKernel<paddle::platform::CUDADeviceContext,
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index bc7def61b2..fb5c4db91e 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -400,7 +400,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out) const {
     auto temp1 = x < static_cast<T>(threshold * -1.f);
     auto temp2 = x > static_cast<T>(threshold);
-    out.device(d) = x * (temp1 + temp2 > 0).template cast<T>();
+    out.device(d) = x * (temp1 + temp2).template cast<T>();
   }
 };
 
@@ -417,7 +417,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 = x < static_cast<T>(threshold * -1.f);
     auto temp2 = x > static_cast<T>(threshold);
-    dx.device(d) = dout * (temp1 + temp2 > 0).template cast<T>();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h
index 8fa0416049..cd60c7707c 100644
--- a/paddle/fluid/operators/cast_op.h
+++ b/paddle/fluid/operators/cast_op.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/transform.h"
 
diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc
index bbd43274a0..ca15858cf6 100644
--- a/paddle/fluid/operators/cast_op_xpu.cc
+++ b/paddle/fluid/operators/cast_op_xpu.cc
@@ -23,8 +23,22 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+template <typename T>
+class XPUFPTypeTrait {
+ public:
+  using Type = T;
+};
+
+template <>
+class XPUFPTypeTrait<platform::float16> {
+ public:
+  using Type = float16;
+};
+
 template <typename DeviceContext, typename InT>
 class CastXPUKernel : public framework::OpKernel<InT> {
+  using XPUInTDType = typename XPUFPTypeTrait<InT>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<framework::Tensor>("X");
@@ -34,27 +48,39 @@ class CastXPUKernel : public framework::OpKernel<InT> {
     auto out_type = static_cast<framework::proto::VarType::Type>(
         context.Attr<int>("out_dtype"));
     auto* in_data = in->data<InT>();
+
+    // using XPUOutTDType = typename XPUFPTypeTrait<InT>::Type;
     auto numel = in->numel();
     auto& dev_ctx = context.template device_context<DeviceContext>();
     int r = -1;
     if (out_type == framework::proto::VarType::FP32) {
       auto* out_data = out->mutable_data<float>(context.GetPlace());
-      r = xpu::cast_v2<InT, float>(dev_ctx.x_context(), in_data, out_data,
-                                   numel);
+      r = xpu::cast_v2<XPUInTDType, float>(
+          dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
+          out_data, numel);
     } else if (out_type == framework::proto::VarType::INT32) {
       auto* out_data = out->mutable_data<int>(context.GetPlace());
-      r = xpu::cast_v2<InT, int32_t>(dev_ctx.x_context(), in_data, out_data,
-                                     numel);
+      r = xpu::cast_v2<XPUInTDType, int32_t>(
+          dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
+          out_data, numel);
     } else if (out_type == framework::proto::VarType::INT64) {
       auto* out_data = out->mutable_data<int64_t>(context.GetPlace());
-      r = xpu::cast_v2<InT, int64_t>(dev_ctx.x_context(), in_data, out_data,
-                                     numel);
+      r = xpu::cast_v2<XPUInTDType, int64_t>(
+          dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
+          out_data, numel);
     } else if ((out_type == framework::proto::VarType::BOOL) &&
                (in_type == framework::proto::VarType::FP32)) {
       auto* out_data = out->mutable_data<bool>(context.GetPlace());
       r = xpu::cast_v2<float, int8_t>(
           dev_ctx.x_context(), (const float*)in_data,
           reinterpret_cast<int8_t*>(out_data), numel);
+    } else if (out_type == framework::proto::VarType::FP16) {
+      auto* out_data =
+          out->mutable_data<paddle::platform::float16>(context.GetPlace());
+      r = xpu::cast_v2<XPUInTDType, float16>(
+          dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
+          reinterpret_cast<float16*>(out_data), numel);
+
     } else {
       PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d",
                                                  in_type, out_type));
@@ -75,5 +101,7 @@ namespace ops = paddle::operators;
 REGISTER_OP_XPU_KERNEL(
     cast, ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int32_t>,
     ops::CastXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::CastXPUKernel<paddle::platform::XPUDeviceContext,
+                       paddle::platform::float16>,
     ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int64_t>);
 #endif
diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h
index ddddb7f864..23a471cfa0 100644
--- a/paddle/fluid/operators/conv_cudnn_op_cache.h
+++ b/paddle/fluid/operators/conv_cudnn_op_cache.h
@@ -40,11 +40,6 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
     CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
 static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
     CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
-#else
-// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc.
-static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
-static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
-static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
 #endif
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h
index 3181e4b1d9..b7859237e7 100644
--- a/paddle/fluid/operators/cudnn_lstm_cache.h
+++ b/paddle/fluid/operators/cudnn_lstm_cache.h
@@ -85,20 +85,12 @@ class ScopedRNNBase {
     dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_,
                              dropout_state, seed_, state_size);
 
-// ------------------- cudnn rnn descriptors ---------------------
-#if CUDNN_VERSION >= 6000
+    // ------------------- cudnn rnn descriptors ---------------------
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
         handle, rnn_desc_.desc(), hidden_size_, num_layers_,
         dropout_desc_.desc(), CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
         CUDNN_RNN_ALGO_STANDARD, cudnn_type));
-#else
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
-        rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(),
-        CUDNN_LINEAR_INPUT,
-        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        cudnn_type));
-#endif
 
 #if CUDNN_VERSION >= 7201
     if (!sequence_length.empty()) {
diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h
index 13a3e7d09b..a6a23a91c7 100644
--- a/paddle/fluid/operators/cudnn_rnn_cache.h
+++ b/paddle/fluid/operators/cudnn_rnn_cache.h
@@ -168,18 +168,11 @@ struct CudnnRNNCache {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
 
-#if CUDNN_VERSION >= 6000
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
         handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
         CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
         CUDNN_RNN_ALGO_STANDARD, cudnn_type));
-#else
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
-        rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
-        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        cudnn_type));
-#endif
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
index 337a76f9f9..5977a434a6 100644
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
@@ -45,7 +45,8 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE_EQ(
         platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument("It must use CUDAPlace."));
+        platform::errors::InvalidArgument(
+            "The polygon_box_transform operator needs to be executed on GPU."));
     auto* in = ctx.Input<Tensor>("Input");
     auto in_dims = in->dims();
     const T* in_data = in->data<T>();
diff --git a/paddle/fluid/operators/distributed_ops/recv_save_op.cc b/paddle/fluid/operators/distributed_ops/recv_save_op.cc
index d194fcda36..d6da818e1d 100644
--- a/paddle/fluid/operators/distributed_ops/recv_save_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_save_op.cc
@@ -20,7 +20,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 0b0b7f69b9..1b607922ed 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -160,7 +160,7 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
                   const Tensor* tensor_dout, Tensor* tensor_dx,
                   Tensor* tensor_dy,
                   const paddle::framework::ExecutionContext& ctx) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     if (1 == tensor_dout->dims().size()) {
       auto dout = framework::EigenVector<T>::Flatten(*tensor_dout);
 
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index abfc88e515..4544386718 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -649,13 +649,18 @@ class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel {
                    "MovingAverageAbsMaxScale");
     OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
                    "MovingAverageAbsMaxScale");
+
     if (ctx->HasOutput("OutState")) {
       ctx->SetOutputDim("OutState", {1});
     }
     if (ctx->HasOutput("OutAccum")) {
       ctx->SetOutputDim("OutAccum", {1});
     }
-    ctx->SetOutputDim("OutScale", {1});
+    if (ctx->HasOutput("Out")) {
+      ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+      ctx->SetOutputDim("OutScale", {1});
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
   }
 
  protected:
@@ -673,6 +678,9 @@ class MovingAverageAbsMaxScaleOpMaker
     AddInput("X", "(Tensor) Input is float data type.");
     AddInput("InAccum", "Last accum.").AsDispensable();
     AddInput("InState", "Last state.").AsDispensable();
+    AddOutput("Out",
+              "(Tensor) Output tensor is just equivalent to the input tensor.")
+        .AsDispensable();
     AddOutput("OutScale", " Current scale");
     AddOutput("OutState", "(Tensor) state buffer.").AsDispensable();
     AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable();
@@ -693,7 +701,7 @@ $$Out = X$$
   }
 };
 
-class FakeQuantDequantGradOp : public framework::OperatorWithKernel {
+class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -701,9 +709,9 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel {
     auto out_grad_name = framework::GradVarName("Out");
     auto x_grad_name = framework::GradVarName("X");
     OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name,
-                   "FakeQuantDequantGradOp");
+                   "StrightThroughEstimatorGradOp");
     OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name,
-                   "FakeQuantDequantGradOp");
+                   "StrightThroughEstimatorGradOp");
 
     ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name));
   }
@@ -717,13 +725,13 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel {
 };
 
 template <typename T>
-class FakeQuantDequantGradMaker : public framework::SingleGradOpMaker<T> {
+class StrightThroughEstimatorMaker : public framework::SingleGradOpMaker<T> {
  public:
   using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
 
  protected:
   void Apply(GradOpPtr<T> grad_op) const override {
-    grad_op->SetType("fake_quantize_dequantize_grad");
+    grad_op->SetType("stright_throuth_estimator_grad");
     grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
     grad_op->SetAttrMap(this->Attrs());
@@ -744,11 +752,11 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max,
                        ops::FakeQuantizeAbsMaxKernel<CPU, float>);
 
-REGISTER_OPERATOR(fake_quantize_dequantize_abs_max,
-                  ops::FakeQuantOrWithDequantAbsMaxOp,
-                  ops::FakeQuantOrWithDequantAbsMaxOpMaker,
-                  ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
-                  ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(
+    fake_quantize_dequantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp,
+    ops::FakeQuantOrWithDequantAbsMaxOpMaker,
+    ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
+    ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_abs_max,
                        ops::FakeQuantizeDequantizeAbsMaxKernel<CPU, float>);
 
@@ -769,11 +777,12 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max,
                        ops::FakeQuantizeMovingAverageAbsMaxKernel<CPU, float>);
 
-REGISTER_OPERATOR(fake_quantize_dequantize_moving_average_abs_max,
-                  ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp,
-                  ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
-                  ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
-                  ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(
+    fake_quantize_dequantize_moving_average_abs_max,
+    ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp,
+    ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
+    ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
+    ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(
     fake_quantize_dequantize_moving_average_abs_max,
     ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CPU, float>);
@@ -789,20 +798,22 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
 REGISTER_OPERATOR(
     moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleOp,
     ops::MovingAverageAbsMaxScaleOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+    ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
+    ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
                        ops::MovingAverageAbsMaxScaleKernel<CPU, float>);
 
-REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp);
-REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad,
-                       ops::FakeQuantDequantGradKernel<CPU, float>);
+REGISTER_OPERATOR(stright_throuth_estimator_grad,
+                  ops::StrightThroughEstimatorGradOp);
+REGISTER_OP_CPU_KERNEL(stright_throuth_estimator_grad,
+                       ops::StrightThroughEstimatorGradKernel<CPU, float>);
 
-REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max,
-                  ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp,
-                  ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
-                  ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
-                  ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(
+    fake_channel_wise_quantize_dequantize_abs_max,
+    ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp,
+    ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
+    ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
+    ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(
     fake_channel_wise_quantize_dequantize_abs_max,
     ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CPU, float>);
@@ -820,4 +831,8 @@ REGISTER_OP_VERSION(moving_average_abs_max_scale)
             "Out",
             "Delete output in order to make the inference model not "
             "save moving_average_abs_max_scale operator. This will "
-            "make the quantitative model be correctly applied in inference."));
+            "make the quantitative model be correctly applied in inference."))
+    .AddCheckpoint(
+        R"ROC(Incompatible upgrade of output [Out])ROC",
+        paddle::framework::compatible::OpVersionDesc().NewOutput(
+            "Out", "In order to support dygraph qat, add output again."));
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 92127f9aeb..78052179f6 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -543,8 +543,8 @@ REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale,
 REGISTER_OP_CUDA_KERNEL(
     fake_quantize_dequantize_moving_average_abs_max,
     ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
-REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad,
-                        ops::FakeQuantDequantGradKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(stright_throuth_estimator_grad,
+                        ops::StrightThroughEstimatorGradKernel<CUDA, float>);
 REGISTER_OP_CUDA_KERNEL(
     fake_channel_wise_quantize_dequantize_abs_max,
     ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CUDA, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 94a75f930b..11a2d2de8b 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -314,6 +314,12 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
     auto* in = context.Input<framework::Tensor>("X");
     auto& dev_ctx = context.template device_context<DeviceContext>();
 
+    if (context.HasOutput("Out")) {
+      auto* out = context.Output<framework::Tensor>("Out");
+      out->mutable_data<T>(context.GetPlace());
+      framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out);
+    }
+
     bool is_test = context.Attr<bool>("is_test");
     // testing
     if (is_test) {
@@ -344,17 +350,17 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
 };
 
 template <typename DeviceContext, typename T>
-class FakeQuantDequantGradKernel : public framework::OpKernel<T> {
+class StrightThroughEstimatorGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* d_out =
         context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
     auto x_grad_name = framework::GradVarName("X");
     auto* d_x = context.Output<framework::LoDTensor>(x_grad_name);
-    PADDLE_ENFORCE_NOT_NULL(
-        d_x, platform::errors::PreconditionNotMet(
-                 "FakeQuantDequantGradOp doesn't have the output named %s.",
-                 x_grad_name));
+    PADDLE_ENFORCE_NOT_NULL(d_x, platform::errors::PreconditionNotMet(
+                                     "StrightThroughEstimatorGradKernel "
+                                     "doesn't have the output named %s.",
+                                     x_grad_name));
 
     // Initialize dx as same as d_out
     d_x->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
index 9711cc8d81..14a6608836 100644
--- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
@@ -14,7 +14,6 @@
 
 #include <paddle/fluid/platform/device_context.h>
 #include <algorithm>
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
index 3c82be2c4e..6cca6b5a97 100644
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -249,6 +249,18 @@ void FusionLSTMOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<float>("Scale_data",
+                 "Scale to be used for int8 input/output data."
+                 "Only used with MKL-DNN INT8.")
+      .SetDefault(1.0f);
+  AddAttr<float>("Shift_data",
+                 "Shift to be used for int8 input/output data."
+                 "Only used with MKL-DNN INT8.")
+      .SetDefault(0.0f);
+  AddAttr<std::vector<float>>("Scale_weights",
+                              "Scale_weights to be used for int8 weights data."
+                              "Only used with MKL-DNN INT8.")
+      .SetDefault({1.0f});
   AddAttr<bool>("force_fp32_output",
                 "(bool, default false) Force INT8 kernel output FP32, only "
                 "used in MKL-DNN INT8")
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
index cf39968a90..1adbd5cd9e 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
@@ -79,13 +79,11 @@ class LSTMMKLDNNHandler
                                    MKLDNNMemoryFormat::ldgo);
       auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
                                      MKLDNNMemoryFormat::tnc);
+
       auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
                                  MKLDNNMemoryFormat::ldnc);
-      auto c0_md = MKLDNNMemDesc(
-          {L, D, N, OC}, MKLDNNGetDataType<float>(),  // Vanilla LSTM and LSTM
-                                                      // with peepoles has c0 as
-                                                      // fp32
-          MKLDNNMemoryFormat::ldnc);
+      auto c0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<float>(),
+                                 MKLDNNMemoryFormat::ldnc);
 
       // Create LSTM oneDNN primitive
       const auto direction =
@@ -266,7 +264,7 @@ class LSTMMKLDNNHandler
           this->fwd_pd_->src_iter_c_desc(), this->engine_);
 
       auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
-      dnnl::reorder(user_c0_memory, *memory_p, this->attr_)
+      dnnl::reorder(user_c0_memory, *memory_p)
           .execute(astream, user_c0_memory, *memory_p);
 
       this->dev_ctx_.SetBlob(c0_key, memory_p);
@@ -360,6 +358,12 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
       weight_h_memory_p =
           handler.template AcquireWeightHMemory<paddle::platform::bfloat16>(
               weight_h);
+    } else {
+      h0_memory_p = handler.template AcquireH0Memory<uint8_t>(h0);
+      weight_x_memory_p =
+          handler.template AcquireWeightXMemory<int8_t>(weight_x);
+      weight_h_memory_p =
+          handler.template AcquireWeightHMemory<int8_t>(weight_h);
     }
 
     auto bias_memory_p = handler.AcquireBiasMemory(bias);
@@ -406,4 +410,5 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(fusion_lstm, MKLDNN, paddle::platform::CPUPlace,
                    ops::FusionLSTMMKLDNNKernel<float>,
-                   ops::FusionLSTMMKLDNNKernel<paddle::platform::bfloat16>);
+                   ops::FusionLSTMMKLDNNKernel<paddle::platform::bfloat16>,
+                   ops::FusionLSTMMKLDNNKernel<uint8_t>);
diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc
index 652c071be6..8234d63d68 100644
--- a/paddle/fluid/operators/inplace_abn_op.cc
+++ b/paddle/fluid/operators/inplace_abn_op.cc
@@ -16,7 +16,6 @@
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/operators/batch_norm_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 419c4d44b6..a8e441a967 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -330,7 +330,10 @@ void BenchKernelSgd() {
     for (int i = 0; i < n; ++i) {
       all.push_back(i);
     }
-    std::random_shuffle(all.begin(), all.end());
+    std::random_device rnd;
+    int64_t seed_tmp = rnd();
+    std::default_random_engine rng(seed_tmp);
+    std::shuffle(all.begin(), all.end(), rng);
     out.insert(out.begin(), all.begin(), all.begin() + n);
     return out;
   };
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index cfddbf213e..ff68565637 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -861,7 +861,10 @@ void TestKernelSgd() {
     for (int i = 0; i < n; ++i) {
       all.push_back(i);
     }
-    std::random_shuffle(all.begin(), all.end());
+    std::random_device rnd;
+    int64_t seed_tmp = rnd();
+    std::default_random_engine rng(seed_tmp);
+    std::shuffle(all.begin(), all.end(), rng);
     out.insert(out.begin(), all.begin(), all.begin() + n);
     return out;
   };
diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu
index a29997e565..d62c1e42d3 100644
--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -114,8 +114,8 @@ __global__ void ConcatKernel(const T** inputs_data, const int in_num,
 }
 
 template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int* out_cols,
+__global__ void SplitKernel(const T* input_data, const int64_t in_row,
+                            const int64_t in_col, const int64_t* out_cols,
                             int out_cols_size, T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
   int curr_segment = 0;
@@ -159,15 +159,15 @@ __device__ void SplitKernelDetail(const T* input_data, const int in_row,
 }
 
 template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
+__global__ void SplitKernel(const T* input_data, const int64_t in_row,
+                            const int64_t in_col, const int64_t fixed_out_col,
                             T** outputs_data) {
   SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
 }
 
 template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
+__global__ void SplitKernel(const T* input_data, const int64_t in_row,
+                            const int64_t in_col, const int64_t fixed_out_col,
                             T* outputs_addr0, T* outputs_addr1) {
   T* outputs_data[2];
   outputs_data[0] = outputs_addr0;
@@ -176,8 +176,8 @@ __global__ void SplitKernel(const T* input_data, const int in_row,
 }
 
 template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
+__global__ void SplitKernel(const T* input_data, const int64_t in_row,
+                            const int64_t in_col, const int64_t fixed_out_col,
                             T* outputs_addr0, T* outputs_addr1,
                             T* outputs_addr2) {
   T* outputs_data[3];
@@ -188,8 +188,8 @@ __global__ void SplitKernel(const T* input_data, const int in_row,
 }
 
 template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
+__global__ void SplitKernel(const T* input_data, const int64_t in_row,
+                            const int64_t in_col, const int64_t fixed_out_col,
                             T* outputs_addr0, T* outputs_addr1,
                             T* outputs_addr2, T* outputs_addr3) {
   T* outputs_data[4];
@@ -201,8 +201,8 @@ __global__ void SplitKernel(const T* input_data, const int in_row,
 }
 
 static inline void GetBlockDims(const platform::CUDADeviceContext& context,
-                                int num_rows, int num_cols, dim3* block_dims,
-                                dim3* grid_dims) {
+                                int64_t num_rows, int64_t num_cols,
+                                dim3* block_dims, dim3* grid_dims) {
   // Set the thread block and grid according to CurrentDeviceId
   const int kThreadsPerBlock = 1024;
   int block_cols = kThreadsPerBlock;
@@ -213,12 +213,12 @@ static inline void GetBlockDims(const platform::CUDADeviceContext& context,
   *block_dims = dim3(block_cols, block_rows, 1);
 
   int max_threads = context.GetMaxPhysicalThreadCount();
-  int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+  int64_t max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
 
   int grid_cols =
       std::min((num_cols + block_cols - 1) / block_cols, max_blocks);
-  int grid_rows =
-      std::min(max_blocks / grid_cols, std::max(num_rows / block_rows, 1));
+  int grid_rows = std::min(max_blocks / grid_cols,
+                           std::max(num_rows / block_rows, (int64_t)1));
   *grid_dims = dim3(grid_cols, grid_rows, 1);
 }
 
@@ -319,22 +319,22 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
                   int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
     int o_num = outputs->size();
-    int out_row = 1;
+    int64_t out_row = 1;
     auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
       out_row *= dim_0[i];
     }
 
-    int out0_col = ref_inputs[0]->numel() / out_row;
-    int in_col = 0, in_row = out_row;
+    int64_t out0_col = ref_inputs[0]->numel() / out_row;
+    int64_t in_col = 0, in_row = out_row;
     bool has_same_shape = true;
 
     std::vector<T*> outputs_data(o_num);
-    std::vector<int> outputs_cols(o_num + 1);
+    std::vector<int64_t> outputs_cols(o_num + 1);
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = ref_inputs.at(i)->numel() / out_row;
+      int64_t t_col = ref_inputs.at(i)->numel() / out_row;
       if (has_same_shape) {
         if (t_col != out0_col) has_same_shape = false;
       }
@@ -384,13 +384,13 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
       auto tmp_dev_ins_col_data =
           memory::Alloc(context,
 
-                        outputs_cols.size() * sizeof(int));
+                        outputs_cols.size() * sizeof(int64_t));
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
                    tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
                    reinterpret_cast<void*>(outputs_cols.data()),
-                   outputs_cols.size() * sizeof(int), context.stream());
-      int* dev_outs_col_data =
-          reinterpret_cast<int*>(tmp_dev_ins_col_data->ptr());
+                   outputs_cols.size() * sizeof(int64_t), context.stream());
+      int64_t* dev_outs_col_data =
+          reinterpret_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
 
       SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
           input.data<T>(), in_row, in_col, dev_outs_col_data,
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 9b64e99c94..c12aecc9ba 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -587,7 +587,7 @@ class MatMulOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_,
                       platform::errors::InvalidArgument(
                           "Input X's width should be equal to the Y's height, "
-                          "but received X's shape: [%s],"
+                          "but received X's shape: [%s], "
                           "Y's shape: [%s].",
                           dim_x, dim_y));
 #endif
diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc
index f92cff2f6c..6fa96aca4b 100644
--- a/paddle/fluid/operators/matmul_op_xpu.cc
+++ b/paddle/fluid/operators/matmul_op_xpu.cc
@@ -23,7 +23,6 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
-
 using framework::Tensor;
 
 static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
@@ -123,34 +122,47 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
       mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_;
     }
   }
-  PADDLE_ENFORCE_EQ(
-      mat_dim_a.width_, mat_dim_b.height_,
-      platform::errors::InvalidArgument("Shape mistake in matmul_op, the "
-                                        "first tensor width must be same as "
-                                        "second tensor height, but received "
-                                        "width:%d, height:%d",
-                                        mat_dim_a.width_, mat_dim_b.height_));
+
+  if (mat_dim_a.width_ == mat_dim_b.height_) {
+    if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) {
+      mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0;
+    }
+    if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) {
+      mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0;
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(mat_dim_a.width_, mat_dim_b.height_,
+                    platform::errors::InvalidArgument(
+                        "Shape mistake in matmul_op, the "
+                        "first tensor width must be same as "
+                        "second tensor height, but received "
+                        "width:%d, height:%d x_dims = %s , y_dims = %s",
+                        mat_dim_a.width_, mat_dim_b.height_,
+                        x_dims.to_str().c_str(), y_dims.to_str().c_str()));
   PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, mat_dim_b.batch_size_,
                     platform::errors::InvalidArgument(
                         "Shape mistake in matmul_op, the two input"
                         "tensor batch_size must be same, but received first "
                         "tensor batch_size:%d, second "
-                        "tensor batch_size:%d",
-                        mat_dim_a.batch_size_, mat_dim_b.batch_size_));
+                        "tensor batch_size:%d, x_dims = %s , y_dims = %s",
+                        mat_dim_a.batch_size_, mat_dim_b.batch_size_,
+                        x_dims.to_str().c_str(), y_dims.to_str().c_str()));
 
-  T alpha = static_cast<T>(ctx.Attr<float>("alpha"));
+  float alpha = static_cast<T>(ctx.Attr<float>("alpha"));
 
-  float *data_c = out->data<T>();
+  T *data_c = out->data<T>();
   int m = mat_dim_a.height_;
   int n = mat_dim_b.width_;
   int k = mat_dim_a.width_;
+  int batch_size = mat_dim_a.batch_size_;
+
   int ldx = mat_dim_a.trans_ ? m : k;
   int ldy = mat_dim_b.trans_ ? k : n;
   int ldout = n;
-  int batch_size = mat_dim_a.batch_size_;
-
-  if (batch_size == 0) {
-    int r = xpu::fc_fusion<float, float, float, FCT>(
+  if (batch_size <= 1) {
+    int r = 0;
+    r = xpu::fc_fusion<T, T, T, FCT>(
         dev_ctx.x_context(), x->data<T>(), y->data<T>(), data_c, m, n, k,
         mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy,
         ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR);
@@ -159,14 +171,32 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
                           "XPU fc_fusion kernel return wrong value[%d %s]", r,
                           XPUAPIErrorMsg[r]));
   } else {
-    int r = xpu::fc_batched<float, float, float, FCT>(
-        dev_ctx.x_context(), batch_size, mat_dim_a.trans_, mat_dim_b.trans_, m,
-        n, k, alpha, x->data<T>(), mat_dim_a.stride_, y->data<T>(),
-        mat_dim_b.stride_, 0.0, data_c, m * n, nullptr, nullptr);
+    // batch matmul
+    int r = xpu::fc_batched<T, T, T, FCT>(
+        dev_ctx.x_context(),                        // Context* ctx,
+        batch_size,                                 // int batch_size,
+        mat_dim_a.trans_,                           // bool x_trans,
+        mat_dim_b.trans_,                           // bool w_trans,
+        m,                                          // int m,
+        n,                                          // int n,
+        k,                                          // int k,
+        alpha,                                      // float alpha,
+        reinterpret_cast<const T *>(x->data<T>()),  // const TX* x,
+        mat_dim_a.stride_,                          // int stride_a,
+        reinterpret_cast<const T *>(y->data<T>()),  // const TW* w,
+        mat_dim_b.stride_,                          // int stride_b,
+        0.0,                                        // float beta,
+        reinterpret_cast<T *>(data_c),              // TY* y,
+        m * n,                                      // int stride_c,
+        nullptr,                                    // const float* x_maxptr,
+        nullptr);                                   // const float* w_maxptr
+
     PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                       platform::errors::External(
-                          "XPU fc_batched kernel return wrong value[%d %s]", r,
-                          XPUAPIErrorMsg[r]));
+                          "XPU fc_batched kernel return wrong value[%d %s] "
+                          "x_dims = %s , y_dims = %s",
+                          r, XPUAPIErrorMsg[r], x_dims.to_str().c_str(),
+                          y_dims.to_str().c_str()));
   }
 }
 
@@ -206,9 +236,8 @@ static framework::Tensor XPUFoldHeadAndLastDims(
                                     static_cast<int>(in_dims[1]),
                                     static_cast<int>(in_dims[2])};
   std::vector<int> axis_host = {1, 0, 2};
-
   int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
-                         in_shape_host.data(), axis_host.data(), /*ndims=*/3);
+                         in_shape_host, axis_host);
   PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                     platform::errors::External(
                         "XPU transpose kernel return wrong value[%d %s]", r,
diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc
index dbb1d7bfb0..d992ef847d 100644
--- a/paddle/fluid/operators/matmul_v2_op_xpu.cc
+++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc
@@ -57,32 +57,55 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
 
   PADDLE_ENFORCE_EQ(mat_dim_a.width_, mat_dim_b.height_,
                     platform::errors::InvalidArgument(
-                        "Shape mistake in matmul_v2_op xdims = %s ydims = %s",
-                        x_dims.to_str(), y_dims.to_str()));
+                        "Shape mistake in matmul_v2_op xdims = %s ydims = %s "
+                        "x_trans = %d y_trans = %d",
+                        x_dims.to_str(), y_dims.to_str(), mat_dim_a.trans_,
+                        mat_dim_b.trans_));
   PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, mat_dim_b.batch_size_,
                     platform::errors::InvalidArgument(
-                        "Shape mistake in matmul_v2_op xdims = %s ydims = %s",
-                        x_dims.to_str(), y_dims.to_str()));
+                        "Shape mistake in matmul_v2_op xdims = %s ydims = %s "
+                        "x_trans = %d y_trans = %d",
+                        x_dims.to_str(), y_dims.to_str(), mat_dim_a.trans_,
+                        mat_dim_b.trans_));
 
-  float* data_c = out->data<T>();
+  T* data_c = out->data<T>();
   int m = mat_dim_a.height_;
   int n = mat_dim_b.width_;
   int k = mat_dim_a.width_;
   int batch_size = mat_dim_a.batch_size_;
-
-  if (batch_size == 0) {
-    int r = xpu::fc<float, float, float, FCT>(
-        dev_ctx.x_context(), x->data<T>(), y->data<T>(), data_c, m, n, k,
-        mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr);
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU fc_fusion kernel return wrong value[%d %s]", r,
-                          XPUAPIErrorMsg[r]));
+  if (batch_size <= 1) {
+    int r = 0;
+    r = xpu::fc<T, T, T, FCT>(dev_ctx.x_context(), x->data<T>(), y->data<T>(),
+                              data_c, m, n, k, mat_dim_a.trans_,
+                              mat_dim_b.trans_, nullptr, nullptr, nullptr);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU fc_fusion kernel return wrong value[%d %s] , m = %d, n = "
+            "%d, "
+            "k = %d, a_tr = %d, b_tr = %d",
+            r, XPUAPIErrorMsg[r], m, n, k, mat_dim_a.trans_, mat_dim_b.trans_));
   } else {
-    int r = xpu::fc_batched<float, float, float, FCT>(
-        dev_ctx.x_context(), batch_size, mat_dim_a.trans_, mat_dim_b.trans_, m,
-        n, k, 1.0, x->data<T>(), mat_dim_a.stride_, y->data<T>(),
-        mat_dim_b.stride_, 0.0, data_c, m * n, nullptr, nullptr);
+    // batch matmul
+    int r = xpu::fc_batched<T, T, T, FCT>(
+        dev_ctx.x_context(),                       // Context* ctx,
+        batch_size,                                // int batch_size,
+        mat_dim_a.trans_,                          // bool x_trans,
+        mat_dim_b.trans_,                          // bool w_trans,
+        m,                                         // int m,
+        n,                                         // int n,
+        k,                                         // int k,
+        1.0,                                       // float alpha,
+        reinterpret_cast<const T*>(x->data<T>()),  // const TX* x,
+        mat_dim_a.stride_,                         // int stride_a,
+        reinterpret_cast<const T*>(y->data<T>()),  // const TW* w,
+        mat_dim_b.stride_,                         // int stride_b,
+        0.0,                                       // float beta,
+        reinterpret_cast<T*>(data_c),              // TY* y,
+        m * n,                                     // int stride_c,
+        nullptr,                                   // const float* x_maxptr,
+        nullptr);                                  // const float* w_maxptr
+
     PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                       platform::errors::External(
                           "XPU fc_batched kernel return wrong value[%d %s]", r,
@@ -125,7 +148,7 @@ static framework::Tensor XPUFoldHeadAndLastDims(
   std::vector<int> axis_host = {1, 0, 2};
 
   int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
-                         in_shape_host.data(), axis_host.data(), /*ndims=*/3);
+                         in_shape_host, axis_host);
   PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                     platform::errors::External(
                         "XPU transpose kernel return wrong value[%d %s]", r,
@@ -189,6 +212,7 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> {
     auto* dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
     auto* dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
     ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
+
     framework::DDim dx_dims;
     if (dx) {
       dx_dims = dx->dims();
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 4beb7ad017..df1b5af121 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -71,6 +71,15 @@ static const std::vector<const Tensor*> ReduceMultiInput(
   return reduced;
 }
 
+static const std::vector<int> GetDimsForKey(
+    const std::vector<const Tensor*>& inputs) {
+  auto dims_key = paddle::framework::vectorize<int>(inputs[0]->dims());
+  for (auto it = std::next(inputs.begin()); it != inputs.end(); ++it) {
+    dims_key.push_back((*it)->dims()[0]);
+  }
+  return dims_key;
+}
+
 template <typename T>
 class ConcatPrimitiveFactory {
  public:
@@ -134,6 +143,8 @@ template <typename T>
 class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    // If any of the multiple inputs of concat has an input size of 0, the
+    // actual size of the multi_input will change
     auto multi_input = ReduceMultiInput(ctx.MultiInput<Tensor>("X"));
     EnforceLayouts(multi_input);
     Tensor* output = ctx.Output<Tensor>("Out");
@@ -156,12 +167,9 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::ToMKLDNNDataType(multi_input[0]->type());
 
     ConcatPrimitiveFactory<T> prim_creator;
-    // If one of the multiple inputs of concat has an input size of 0, the
-    // actual size of the multi_input will change
-    std::string key = platform::CreateKey(
-        dev_ctx, paddle::framework::vectorize<int>(multi_input[0]->dims()),
-        multi_input.size(), ctx.OutputName("Out"), dt,
-        platform::ThreadIDasStr());
+    std::string key =
+        platform::CreateKey(dev_ctx, GetDimsForKey(multi_input),
+                            multi_input.size(), ctx.OutputName("Out"), dt);
     key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
 
     const std::string key_prim = key + "@concat_p";
diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h
index e93d579220..be6f4422d4 100644
--- a/paddle/fluid/operators/nll_loss_op.h
+++ b/paddle/fluid/operators/nll_loss_op.h
@@ -36,7 +36,10 @@ static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data,
       }
       PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
                         platform::errors::InvalidArgument(
-                            "label should not be out of bounds."));
+                            "Label value is out of range. "
+                            "Expected label value in range of [0, %d), but "
+                            "received value is %d.",
+                            n_classes, cur_label));
 
       const auto cur_weight =
           weight_data ? weight_data[cur_label] : static_cast<T>(1);
diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc
index 9c321832f8..64323e588c 100644
--- a/paddle/fluid/operators/one_hot_op.cc
+++ b/paddle/fluid/operators/one_hot_op.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/operators/one_hot_op.h"
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/one_hot_op_xpu.cc b/paddle/fluid/operators/one_hot_op_xpu.cc
index 14ecd11d11..3e214aa8bf 100644
--- a/paddle/fluid/operators/one_hot_op_xpu.cc
+++ b/paddle/fluid/operators/one_hot_op_xpu.cc
@@ -16,7 +16,6 @@
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/operators/one_hot_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc
index 29fe6f10c7..c42db1e6f4 100644
--- a/paddle/fluid/operators/one_hot_v2_op.cc
+++ b/paddle/fluid/operators/one_hot_v2_op.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/operators/one_hot_v2_op.h"
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/one_hot_v2_op_xpu.cc b/paddle/fluid/operators/one_hot_v2_op_xpu.cc
index 6fec597db1..e24be3bead 100644
--- a/paddle/fluid/operators/one_hot_v2_op_xpu.cc
+++ b/paddle/fluid/operators/one_hot_v2_op_xpu.cc
@@ -16,7 +16,6 @@
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/operators/one_hot_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
index 1740f2982b..3baba424e8 100644
--- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
@@ -121,19 +121,25 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
       } else {
         T cpu_beta1_pow_out_data;
         T cpu_beta2_pow_out_data;
-        xpu_memcpy(&cpu_beta1_pow_out_data, beta1_pow_ptr, sizeof(T),
-                   XPU_DEVICE_TO_HOST);
+        memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data,
+                     BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()),
+                     beta1_pow_ptr, sizeof(T));
+
         cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1;
-        xpu_memcpy(&cpu_beta2_pow_out_data, beta2_pow_ptr, sizeof(T),
-                   XPU_DEVICE_TO_HOST);
+        memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data,
+                     BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()),
+                     beta2_pow_ptr, sizeof(T));
+
         cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2;
 
         T* beta1_pow_out_p = beta1_pow_out->mutable_data<T>(ctx.GetPlace());
         T* beta2_pow_out_p = beta2_pow_out->mutable_data<T>(ctx.GetPlace());
-        xpu_memcpy(beta1_pow_out_p, &cpu_beta1_pow_out_data, sizeof(T),
-                   XPU_HOST_TO_DEVICE);
-        xpu_memcpy(beta2_pow_out_p, &cpu_beta2_pow_out_data, sizeof(T),
-                   XPU_HOST_TO_DEVICE);
+        memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
+                     beta1_pow_out_p, platform::CPUPlace(),
+                     &cpu_beta1_pow_out_data, sizeof(T));
+        memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
+                     beta2_pow_out_p, platform::CPUPlace(),
+                     &cpu_beta2_pow_out_data, sizeof(T));
       }
 
       PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc
index c04bdb2f10..a7d177f326 100644
--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/operators/reader/py_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc
index 9086291e17..38894495b4 100644
--- a/paddle/fluid/operators/reader/read_op.cc
+++ b/paddle/fluid/operators/reader/read_op.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/platform/profiler.h"
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 9766008963..92e5e4a0cd 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -210,9 +210,10 @@ void RecurrentOp::RunImpl(const framework::Scope &scope,
   auto *block = Attr<framework::BlockDesc *>(kStepBlock);
 
   auto *program = block->Program();
-  auto ctx = executor.Prepare(
-      *program, block->ID(), Attr<std::vector<std::string>>(
-                                 kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);
+  auto ctx = executor.Prepare(*program, block->ID(),
+                              Attr<std::vector<std::string>>(
+                                  kSkipEagerDeletionVars), /*skip_ref_cnt_vars*/
+                              true);
 
   static std::mutex mutex;
   std::lock_guard<std::mutex> lock(mutex);
@@ -255,16 +256,6 @@ void RecurrentOp::RunImpl(const framework::Scope &scope,
     // Link inside::output -> outside::output
     //   outside::output[seq_offset: seq_offset + 1] = inside::output
     executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
-    if (i > 0) {
-      LinkTensorWithCallback(scope, Outputs(kOutputs), cur_scope,
-                             Outputs(kOutputs),
-                             [&](const framework::LoDTensor &src_tensor,
-                                 framework::LoDTensor *dst_tensor) {
-                               framework::Tensor src_slice =
-                                   src_tensor.Slice(seq_offset, seq_offset + 1);
-                               dst_tensor->ShareDataWith(src_slice);
-                             });
-    }
 
     // Linked now, execute!
     executor.RunPreparedContext(ctx.get(), &cur_scope,
@@ -284,6 +275,14 @@ void RecurrentOp::RunImpl(const framework::Scope &scope,
             // early.
             framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
           });
+    } else {
+      LinkTensorWithCallback(
+          cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
+          [&](const framework::LoDTensor &src_tensor,
+              framework::LoDTensor *dst_tensor) {
+            auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
+            framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
+          });
     }
 
     scopes.ForwardNext();
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 94efa70e46..e119a21caa 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -377,31 +377,9 @@ class ReshapeKernel {
 
     out->Resize(out_dims);
     out->mutable_data(ctx.GetPlace(), in->type());
-
-#ifdef PADDLE_WITH_XPU
-    if (platform::is_xpu_place(ctx.GetPlace())) {
-      void *out_ptr = out->data<void>();
-      const void *in_ptr = in->data<void>();
-      if ((out_ptr != nullptr) && (in_ptr != nullptr) &&
-          (paddle::framework::SizeOfType(in->type()) > 0)) {
-        auto &dev_ctx =
-            ctx.template device_context<paddle::platform::XPUDeviceContext>();
-        int r = xpu::memcpy_device(
-            dev_ctx.x_context(), out_ptr, in_ptr,
-            in->numel() * paddle::framework::SizeOfType(in->type()));
-        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                          platform::errors::External(
-                              "XPU memcpy_device return wrong value[%d %s]", r,
-                              XPUAPIErrorMsg[r]));
-      }
-    } else {
-#endif
-      framework::TensorCopy(
-          *in, ctx.GetPlace(),
-          ctx.template device_context<platform::DeviceContext>(), out);
-#ifdef PADDLE_WITH_XPU
-    }
-#endif
+    framework::TensorCopy(
+        *in, ctx.GetPlace(),
+        ctx.template device_context<platform::DeviceContext>(), out);
     out->Resize(out_dims);
   }
 };
diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h
index 0246c42d43..939768693a 100644
--- a/paddle/fluid/operators/save_combine_op.h
+++ b/paddle/fluid/operators/save_combine_op.h
@@ -22,7 +22,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h
index fbde722a42..e44a5c77bd 100644
--- a/paddle/fluid/operators/save_op.h
+++ b/paddle/fluid/operators/save_op.h
@@ -19,7 +19,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc
index 94d34c648d..105d61015f 100644
--- a/paddle/fluid/operators/set_value_op.cc
+++ b/paddle/fluid/operators/set_value_op.cc
@@ -124,6 +124,9 @@ class SetValueMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<int64_t>>(
         "steps", "(list<int64_t>) Stride step from the start to the end.")
         .SetDefault({});
+    AddAttr<std::vector<int64_t>>("decrease_axes",
+                                  "(list<int>) The axes to decrease.")
+        .SetDefault({});
 
     AddAttr<std::vector<int>>("bool_values", "Store the bool values.")
         .SetDefault({});
@@ -185,4 +188,10 @@ Upgrade set_value, add 3 inputs [StartsTensorList, EndsTensorList, StepsTensorLi
                         "Ending indices of corresponding axis in `axes`.",
                         std::vector<int64_t>{})
             .NewAttr("steps", "Stride step from the start to the end.",
-                     std::vector<int64_t>{}));
+                     std::vector<int64_t>{}))
+    .AddCheckpoint(
+        R"ROC(
+Upgrade set_value, add 1 attribute [decrease_axes].
+              )ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "decrease_axes", "The axes to decrease.", std::vector<int64_t>{}));
diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h
index 325a2b0b86..eca51147f8 100644
--- a/paddle/fluid/operators/set_value_op.h
+++ b/paddle/fluid/operators/set_value_op.h
@@ -106,10 +106,10 @@ inline void CheckAndUpdateSlice(const framework::DDim in_dims,
 }
 
 inline framework::DDim GetSliceDims(const framework::DDim in_dims,
-                                    const std::vector<int64_t> axes,
-                                    const std::vector<int64_t> starts,
-                                    const std::vector<int64_t> ends,
-                                    const std::vector<int64_t> steps) {
+                                    const std::vector<int64_t>& axes,
+                                    const std::vector<int64_t>& starts,
+                                    const std::vector<int64_t>& ends,
+                                    const std::vector<int64_t>& steps) {
   framework::DDim slice_dims(in_dims);
 
   for (size_t i = 0; i < axes.size(); ++i) {
@@ -127,6 +127,38 @@ inline framework::DDim GetSliceDims(const framework::DDim in_dims,
   return slice_dims;
 }
 
+inline framework::DDim GetDecreasedDims(
+    const framework::DDim slice_dims,
+    const std::vector<int64_t>& decrease_axes) {
+  // Get dims after decreasing axes.
+  framework::DDim decreased_dims(slice_dims);
+  if (decrease_axes.size() > 0) {
+    for (size_t i = 0; i < decrease_axes.size(); ++i) {
+      int64_t axis = decrease_axes[i];
+      PADDLE_ENFORCE_EQ(
+          decreased_dims[axis], 1,
+          platform::errors::InvalidArgument("decrease dim should be 1"));
+      decreased_dims[axis] = 0;
+    }
+
+    std::vector<int64_t> new_shape;
+    for (int i = 0; i < decreased_dims.size(); ++i) {
+      if (decreased_dims[i] != 0) {
+        new_shape.push_back(decreased_dims[i]);
+      }
+    }
+
+    // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and
+    // uses [1] instead.
+    if (new_shape.size() == 0) {
+      new_shape.push_back(1);
+    }
+
+    decreased_dims = framework::make_ddim(new_shape);
+  }
+  return decreased_dims;
+}
+
 template <typename DeviceContext, typename T>
 class SetValueKernel : public framework::OpKernel<T> {
  public:
@@ -179,6 +211,7 @@ class SetValueKernel : public framework::OpKernel<T> {
     auto ends = ctx.Attr<std::vector<int64_t>>("ends");
     auto steps = ctx.Attr<std::vector<int64_t>>("steps");
     auto shape = ctx.Attr<std::vector<int64_t>>("shape");
+    auto decrease_axes = ctx.Attr<std::vector<int64_t>>("decrease_axes");
 
     auto dtype = in->type();
     if (!starts_tensor_list.empty()) {
@@ -194,6 +227,7 @@ class SetValueKernel : public framework::OpKernel<T> {
     auto in_dims = in->dims();
     CheckAndUpdateSlice(in_dims, axes, &starts, &ends, &steps);
     auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, steps);
+    auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes);
 
     auto place = ctx.GetPlace();
     auto& eigen_place =
@@ -212,13 +246,13 @@ class SetValueKernel : public framework::OpKernel<T> {
     // set_value is what we want.
     TensorCopy(*in, place, out);
 
-    Tensor slice_t(dtype), pad_t(dtype);
-    slice_t.mutable_data<T>(slice_dims, place);
-    pad_t.mutable_data<T>(in_dims, place);
+    Tensor slice_tensor(dtype), pad_tensor(dtype);
+    slice_tensor.mutable_data<T>(slice_dims, place);
+    pad_tensor.mutable_data<T>(in_dims, place);
 
-    auto pad_e = framework::EigenTensor<T, D>::From(pad_t, in_dims);
+    auto pad_e = framework::EigenTensor<T, D>::From(pad_tensor, in_dims);
     auto out_e = framework::EigenTensor<T, D>::From(*out);
-    auto slice_e = framework::EigenTensor<T, D>::From(slice_t, slice_dims);
+    auto slice_e = framework::EigenTensor<T, D>::From(slice_tensor, slice_dims);
 
     // Step 1: Set the value of out at `_index` to zero
     slice_e.device(eigen_place) = slice_e.constant(T(0));
@@ -244,11 +278,26 @@ class SetValueKernel : public framework::OpKernel<T> {
 
     // Step 2: Set a tensor with the same shape as out tensor. And its data at
     // '_index' is the same as value_tensor, and data out of '_index' to zero
+
     // - Step 2.1 Set slice tensor with value
+
+    // NOTE(liym27): [ Why resize slice_tensor here? ]
+    // A: When do broadcasting on slice_tensor and value_tensor, the shape of
+    // slice_tensor should be decreased dims.
+    // e.g.
+    //  x[:,0] = value_tensor
+    // x's shape = [3, 4], value_tensor's shape = [3]
+    // We get slice_dims = [3, 1],  decrease_slice_dims = [3]
+    // If do broadcasting on Tensor with shape [3, 1] and [3], the result's
+    // shape is [3, 3], which cross the border;
+    // If do broadcasting on Tensor with shape [3] and [3], the result's shape
+    // is [3], which is right.
+
+    slice_tensor.Resize(decrease_slice_dims);
     if (value_tensor != nullptr) {
       // ElementwiseComputeEx can do broadcasting
       ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, &slice_t, value_tensor, -1, SubFunctor<T>(), &slice_t);
+          ctx, &slice_tensor, value_tensor, -1, SubFunctor<T>(), &slice_tensor);
     } else {
       Tensor value_t(dtype);
       auto value_dims = framework::make_ddim(shape);
@@ -257,8 +306,9 @@ class SetValueKernel : public framework::OpKernel<T> {
       CopyVecotorToTensor<T>(value_name.c_str(), &value_t, ctx);
       value_t.Resize(value_dims);
       ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, &slice_t, &value_t, -1, SubFunctor<T>(), &slice_t);
+          ctx, &slice_tensor, &value_t, -1, SubFunctor<T>(), &slice_tensor);
     }
+    slice_tensor.Resize(slice_dims);
 
     // - Step 2.2 Pad slice tensor with 0
     pad_e.device(eigen_place) = pad_e.constant(T(0));
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
index 346ed965d0..8635def2ec 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
@@ -45,11 +45,25 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> {
     const int n = SizeToAxis(axis, logits->dims());
     const int d = SizeFromAxis(axis, logits->dims());
     std::vector<int> logits_dims = framework::vectorize<int>(logits->dims());
+
     // softmax
     auto& dev_ctx =
         context.template device_context<platform::XPUDeviceContext>();
-    int r = xpu::softmax(dev_ctx.x_context(), logits->data<float>(),
-                         softmax->data<float>(), logits_dims, axis);
+    int r = XPU_SUCCESS;
+    Tensor clip_logits;
+    int len = logits->numel();
+    T* clip_logits_data =
+        clip_logits.mutable_data<T>(context.GetPlace(), len * sizeof(T));
+    r = xpu::clip(dev_ctx.x_context(), logits->data<float>(), clip_logits_data,
+                  len, -1e30, 1e30);
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External("XPU kernel error. clip "
+                                   "execution not succeed, error code=%d",
+                                   r));
+
+    r = xpu::softmax(dev_ctx.x_context(), clip_logits_data,
+                     softmax->data<float>(), logits_dims, axis);
 
     PADDLE_ENFORCE_EQ(
         r, xpu::Error_t::SUCCESS,
diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h
index 38ab60afd9..03d5324528 100644
--- a/paddle/fluid/operators/stack_op.h
+++ b/paddle/fluid/operators/stack_op.h
@@ -30,7 +30,7 @@ struct StackGradFunctor {
     int i = idx / (n_ * post_);
     int which_x = idx / post_ - i * n_;
     int x_index = i * post_ + idx % post_;
-    dx_[which_x][x_index] = dy_[idx];
+    if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx];
   }
 
  private:
@@ -95,19 +95,21 @@ class StackGradKernel : public framework::OpKernel<T> {
     auto dx = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
     int axis = ctx.Attr<int>("axis");
     if (axis < 0) axis += dy->dims().size();
-
     int n = dy->dims()[axis];
     std::vector<T *> dx_datas(n);  // NOLINT
+
     for (int i = 0; i < n; i++) {
-      dx_datas[i] = dx[i]->mutable_data<T>(ctx.GetPlace());
+      if (dx[i] == nullptr) {
+        dx_datas[i] = nullptr;
+      } else {
+        dx_datas[i] = dx[i]->mutable_data<T>(ctx.GetPlace());
+      }
     }
     auto dy_data = dy->data<T>();
-
     int pre = 1;
     for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
     int total_num = dy->numel();
     int post = total_num / (n * pre);
-
     auto &dev_ctx = ctx.template device_context<DeviceContext>();
     auto dx_data_arr = dx_datas.data();
     StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
index 7451cac63d..e90eefd72d 100644
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -159,8 +159,7 @@ class WarpCTCFunctor {
     warpctc_version_ = platform::dynload::get_warpctc_version();
 
     if (platform::is_gpu_place(ctx.GetPlace())) {
-// HIP not support ctcOptions in third-party warpctc
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       options_.loc = CTC_GPU;
       options_.stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                             ctx.device_context())
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index af0df2efc5..6c3c96b68c 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -91,30 +91,6 @@ enum class ActivationMode {
   kBandPass,
 };
 
-#if CUDNN_VERSION < 6000
-#pragma message "CUDNN version under 6.0 is supported at best effort."
-#pragma message "We strongly encourage you to move to 6.0 and above."
-#pragma message "This message is intended to annoy you enough to update."
-#pragma message \
-    "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/"
-
-inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
-  switch (mode) {
-    case PoolingMode::kMaximumDeterministic:
-      return CUDNN_POOLING_MAX;
-    case PoolingMode::kAverageExclusive:
-      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-    case PoolingMode::kAverageInclusive:
-      return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-    case PoolingMode::kMaximum:
-      return CUDNN_POOLING_MAX;
-    default:
-      PADDLE_THROW(
-          platform::errors::Unimplemented("Unexpected CUDNN pooling mode."));
-  }
-}
-#else
-
 inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
   switch (mode) {
     case PoolingMode::kMaximumDeterministic:
@@ -130,7 +106,6 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
           platform::errors::Unimplemented("Unexpected CUDNN pooling mode."));
   }
 }
-#endif  // CUDNN_VERSION < 6000
 
 inline ActivationMode StringToActivationMode(const std::string& str) {
   if (str == "identity") {
@@ -471,19 +446,6 @@ class ScopedConvolutionDescriptor {
             "of pads is %d, size of dilations is %d.",
             pads.size(), dilations.size()));
 
-#if !CUDNN_VERSION_MIN(6, 0, 0)
-    // cudnn v5 does not support dilation conv, the argument is called upscale
-    // instead of dilations and it is must be one.
-    for (size_t i = 0; i < dilations.size(); ++i) {
-      PADDLE_ENFORCE_EQ(dilations[i], 1,
-                        platform::errors::InvalidArgument(
-                            "Dilations conv is not supported in this cuDNN "
-                            "version(%d.%d.%d).",
-                            CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100,
-                            CUDNN_VERSION % 100));
-    }
-#endif
-
     cudnnDataType_t compute_type =
         (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor(
diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h
index a8ad729a31..0db4cc71b1 100644
--- a/paddle/fluid/platform/eigen_ext.h
+++ b/paddle/fluid/platform/eigen_ext.h
@@ -24,7 +24,6 @@
 
 namespace Eigen {
 
-using bfloat16 = paddle::platform::bfloat16;
 using complex64 = paddle::platform::complex64;
 using complex128 = paddle::platform::complex128;
 using float16 = paddle::platform::float16;
@@ -33,7 +32,8 @@ template <typename T>
 struct NumTraits;
 
 template <>
-struct NumTraits<bfloat16> : GenericNumTraits<bfloat16> {
+struct NumTraits<paddle::platform::bfloat16>
+    : GenericNumTraits<paddle::platform::bfloat16> {
   enum {
     IsSigned = true,
     IsInteger = false,
@@ -41,22 +41,22 @@ struct NumTraits<bfloat16> : GenericNumTraits<bfloat16> {
     RequireInitialization = false
   };
 
-  HOSTDEVICE static inline bfloat16 epsilon() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 epsilon() {
     return paddle::platform::raw_uint16_to_bfloat16(0x3400);
   }
-  HOSTDEVICE static inline bfloat16 dummy_precision() {
-    return bfloat16(1e-5f);
+  HOSTDEVICE static inline paddle::platform::bfloat16 dummy_precision() {
+    return paddle::platform::bfloat16(1e-5f);
   }
-  HOSTDEVICE static inline bfloat16 highest() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 highest() {
     return paddle::platform::raw_uint16_to_bfloat16(0x7f7f);
   }
-  HOSTDEVICE static inline bfloat16 lowest() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 lowest() {
     return paddle::platform::raw_uint16_to_bfloat16(0xff7f);
   }
-  HOSTDEVICE static inline bfloat16 infinity() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 infinity() {
     return paddle::platform::raw_uint16_to_bfloat16(0x7f80);
   }
-  HOSTDEVICE static inline bfloat16 quiet_NaN() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 quiet_NaN() {
     return paddle::platform::raw_uint16_to_bfloat16(0xffc1);
   }
 };
@@ -137,68 +137,91 @@ namespace numext {
 //////////// bfloat methods /////////////
 
 template <>
-HOSTDEVICE inline bool(isnan)(const bfloat16& a) {
+HOSTDEVICE inline bool(isnan)(const paddle::platform::bfloat16& a) {
   return (paddle::platform::isnan)(a);
 }
 
 template <>
-HOSTDEVICE inline bool(isinf)(const bfloat16& a) {
+HOSTDEVICE inline bool(isinf)(const paddle::platform::bfloat16& a) {
   return (paddle::platform::isinf)(a);
 }
 
 template <>
-HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
+HOSTDEVICE inline bool(isfinite)(const paddle::platform::bfloat16& a) {
   return (paddle::platform::isfinite)(a);
 }
 
 template <>
-HOSTDEVICE inline bfloat16 exp(const bfloat16& a) {
-  return bfloat16(::expf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 exp(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::expf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 erf(const bfloat16& a) {
-  return bfloat16(::erff(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 erf(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::erff(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 log(const bfloat16& a) {
-  return bfloat16(::logf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 log(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::logf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) {
-  return bfloat16(::tanhf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 tanh(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::tanhf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) {
-  return bfloat16(::sqrtf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 sqrt(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::sqrtf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) {
-  return bfloat16(::ceilf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 ceil(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::ceilf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 floor(const bfloat16& a) {
-  return bfloat16(::floorf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 floor(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::floorf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 round(const bfloat16& a) {
-  return bfloat16(::roundf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 round(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::roundf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) {
-  return bfloat16(::powf(static_cast<float>(a), static_cast<float>(b)));
+HOSTDEVICE inline paddle::platform::bfloat16 pow(
+    const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
+  return paddle::platform::bfloat16(
+      ::powf(static_cast<float>(a), static_cast<float>(b)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 abs(const bfloat16& a) {
-  return bfloat16(::fabs(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 abs(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::fabs(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline paddle::platform::bfloat16 mini(
+    const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
+  return b < a ? b : a;
+}
+
+template <>
+HOSTDEVICE inline paddle::platform::bfloat16 maxi(
+    const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
+  return a < b ? b : a;
 }
 
 //////////// complex64 methods /////////////
@@ -398,5 +421,15 @@ HOSTDEVICE inline float16 abs(const float16& a) {
   return float16(::fabs(static_cast<float>(a)));
 }
 
+template <>
+HOSTDEVICE inline float16 mini(const float16& a, const float16& b) {
+  return b < a ? b : a;
+}
+
+template <>
+HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) {
+  return a < b ? b : a;
+}
+
 }  // namespace numext
 }  // namespace Eigen
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 1a55562f2b..fa77c0be03 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -564,3 +564,15 @@ DEFINE_string(tracer_mkldnn_ops_on, "",
  */
 DEFINE_string(tracer_mkldnn_ops_off, "",
               "List of OneDNN operation types to be turned off");
+
+/**
+ * CUDNN related FLAG
+ * Name: conv2d_disable_cudnn
+ * Since Version:
+ * Value Range: bool, default=false
+ * Example:
+ * Note: Disable cudnn in conv2d.
+ */
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
+#endif
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index 6074d191ad..e8ba16398d 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -72,6 +72,7 @@ DECLARE_uint64(conv_workspace_size_limit);
 DECLARE_bool(cudnn_batchnorm_spatial_persistent);
 DECLARE_bool(cudnn_deterministic);
 DECLARE_bool(cudnn_exhaustive_search);
+DECLARE_bool(conv2d_disable_cudnn);
 // data processing
 DECLARE_bool(enable_cublas_tensor_op_math);
 // device management
@@ -367,7 +368,8 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_fraction_of_cuda_pinned_memory_to_use,
       FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb,
       FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
-      FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce);
+      FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce,
+      FLAGS_conv2d_disable_cudnn);
 #endif
 #ifdef PADDLE_WITH_XPU
   REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus);
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 58ef177863..eed3b3b769 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -611,15 +611,17 @@ void BindImperative(py::module *m_ptr) {
              // TODO(liym27): Try not to call TensorToPyArray because it always
              // copys data to cpu place, which reduces performance.
              if (parse_index && value_is_tensor) {
-               std::vector<int> axes, starts, ends, steps, decrease_axis,
+               std::vector<int> axes, starts, ends, steps, decrease_axes,
                    infer_flags;
                ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends,
-                                  &steps, &decrease_axis, &infer_flags);
+                                  &steps, &decrease_axes, &infer_flags);
 
-               framework::AttributeMap attrs = {{"axes", axes},
-                                                {"starts", starts},
-                                                {"ends", ends},
-                                                {"steps", steps}};
+               framework::AttributeMap attrs = {
+                   {"axes", axes},
+                   {"starts", starts},
+                   {"ends", ends},
+                   {"steps", steps},
+                   {"decrease_axes", decrease_axes}};
 
                imperative::NameVarBaseMap ins = {{"Input", {self}}};
                imperative::NameVarBaseMap outs = {{"Out", {self}}};
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index b1c42d91df..69856fa4fa 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -84,7 +84,8 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
     {"matrix_nms", {"Out", "Index", "RoisNum"}},
     {"distribute_fpn_proposals",
      {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}},
-    {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}},
+    {"moving_average_abs_max_scale",
+     {"Out", "OutScale", "OutAccum", "OutState"}},
     {"multiclass_nms3", {"Out", "NmsRoisNum"}},
     {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
     {"momentum", {"ParamOut", "VelocityOut"}},
@@ -137,7 +138,8 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
     {"check_finite_and_unscale", {"Out", "FoundInfinite"}},
     {"update_loss_scaling",
      {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
-    {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}},
+    {"moving_average_abs_max_scale",
+     {"Out", "OutScale", "OutAccum", "OutState"}},
     {"lamb",
      {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
     {"rnn", {"DropoutState"}},
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c8ca3bf2c8..d8ee80c007 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -29,12 +29,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/io/fs.h"
 #include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
 #include "paddle/fluid/framework/ir/pass_builder.h"
-#include "paddle/fluid/framework/load_op_lib.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -1753,7 +1751,6 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("init_gflags", framework::InitGflags);
   m.def("init_glog", framework::InitGLOG);
-  m.def("load_op_library", framework::LoadOpLib);
   m.def("load_op_meta_info_and_register_op",
         framework::LoadOpMetaInfoAndRegisterOp);
   m.def("init_devices", []() { framework::InitDevices(); });
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index c5bb7ea472..ab2b6f58b9 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -30,10 +30,13 @@ taskkill /f /im op_function_generator.exe
 wmic process where name="op_function_generator.exe" call terminate
 taskkill /f /im python.exe  2>NUL
 
+<<<<<<< HEAD
 :: TODO: Temporarily，REMOVE after VS2017 is stable.
 set WITH_TPCACHE=OFF
 rmdir %cache_dir%\third_party_GPU /s/q
 rmdir %cache_dir%\third_party /s/q
+=======
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
 rem ------initialize common variable------
 if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64"
@@ -85,7 +88,7 @@ git show-ref --verify --quiet refs/heads/last_pr
 if %ERRORLEVEL% EQU 0 (
     git diff HEAD last_pr --stat --name-only
     git diff HEAD last_pr --stat --name-only | findstr "setup.py.in"
-    if %ERRORLEVEL% EQU 0 (
+    if !ERRORLEVEL! EQU 0 (
         rmdir build /s/q
     )
     git branch -D last_pr
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 7a360ac229..7f184f1898 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -1810,11 +1810,11 @@ function collect_ccache_hits() {
 
 function test_op_benchmark() {
     # The PR will pass quickly when get approval from specific person.
-    # Xreki 12538138, luotao1 6836917, GaoWei8 53294385
+    # Xreki 12538138, luotao1 6836917, Avin0323 16167147
     set +x
     approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
     if [ "${approval_line}" != "" ]; then
-        APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 53294385 12538138 6836917)
+        APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 16167147 12538138 6836917)
         echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
         if [ "${APPROVALS}" == "TRUE" ]; then
             echo "==================================="
diff --git a/python/paddle/README.md b/python/paddle/README.rst
similarity index 100%
rename from python/paddle/README.md
rename to python/paddle/README.rst
diff --git a/python/paddle/distributed/fleet/data_generator/data_generator.py b/python/paddle/distributed/fleet/data_generator/data_generator.py
index 669d2ea24a..9d743fc38b 100644
--- a/python/paddle/distributed/fleet/data_generator/data_generator.py
+++ b/python/paddle/distributed/fleet/data_generator/data_generator.py
@@ -32,11 +32,11 @@ class DataGenerator(object):
         '''
         Set batch size of current DataGenerator
         This is necessary only if a user wants to define generator_batch
-        
+
         Example:
 
             .. code-block:: python
-                
+
                 import paddle.distributed.fleet.data_generator as dg
                 class MyData(dg.DataGenerator):
 
@@ -52,7 +52,7 @@ class DataGenerator(object):
                                 yield ("words", s[1].extend([s[1][0]]))
                 mydata = MyData()
                 mydata.set_batch(128)
-                    
+
         '''
         self.batch_size_ = batch_size
 
@@ -63,7 +63,7 @@ class DataGenerator(object):
 
         Example:
             .. code-block:: python
-                
+
                 import paddle.distributed.fleet.data_generator as dg
                 class MyData(dg.DataGenerator):
 
@@ -100,9 +100,9 @@ class DataGenerator(object):
         generated.
 
         Example:
-        
+
             .. code-block:: python
-                
+
                 import paddle.distributed.fleet.data_generator as dg
                 class MyData(dg.DataGenerator):
 
@@ -161,7 +161,7 @@ class DataGenerator(object):
               The data format is list or tuple: 
             [(name, [feasign, ...]), ...] 
               or ((name, [feasign, ...]), ...)
-             
+
             For example:
             [("words", [1926, 08, 17]), ("label", [1])]
               or (("words", [1926, 08, 17]), ("label", [1]))
@@ -174,7 +174,7 @@ class DataGenerator(object):
         Example:
 
             .. code-block:: python
-                
+
                 import paddle.distributed.fleet.data_generator as dg
                 class MyData(dg.DataGenerator):
 
@@ -206,7 +206,7 @@ class DataGenerator(object):
         Example:
 
             .. code-block:: python
-                
+
                 import paddle.distributed.fleet.data_generator as dg
                 class MyData(dg.DataGenerator):
 
@@ -259,6 +259,9 @@ class MultiSlotStringDataGenerator(DataGenerator):
         Returns:
             Return a string data that can be read directly by the MultiSlotDataFeed.
         '''
+        if sys.version > '3' and isinstance(line, zip):
+            line = list(line)
+
         if not isinstance(line, list) and not isinstance(line, tuple):
             raise ValueError(
                 "the output of process() must be in list or tuple type"
@@ -289,7 +292,7 @@ class MultiSlotDataGenerator(DataGenerator):
             >>> [ids_num id1 id2 ...] ...
         The proto_info will be in this format:
             >>> [(name, type), ...]
-        
+
         For example, if the input is like this:
             >>> [("words", [1926, 08, 17]), ("label", [1])]
             >>> or (("words", [1926, 08, 17]), ("label", [1]))
@@ -304,6 +307,9 @@ class MultiSlotDataGenerator(DataGenerator):
         Returns:
             Return a string data that can be read directly by the MultiSlotDataFeed.
         '''
+        if sys.version > '3' and isinstance(line, zip):
+            line = list(line)
+
         if not isinstance(line, list) and not isinstance(line, tuple):
             raise ValueError(
                 "the output of process() must be in list or tuple type"
diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
index 00d58cbd99..c3d27bcc4e 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/common.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -47,7 +47,7 @@ def is_optimizer_op(op):
 
 
 class CollectiveHelper(object):
-    def __init__(self, role_maker, nrings=1, wait_port='6174'):
+    def __init__(self, role_maker, nrings=1, wait_port=True):
         self.nrings = nrings
         self.wait_port = wait_port
         self.role_maker = role_maker
@@ -65,14 +65,48 @@ class CollectiveHelper(object):
                 self.role_maker._worker_index(), ring_id, self.wait_port)
         self._broadcast_params()
 
-    def _init_communicator(self, program, current_endpoint, endpoints, rank,
-                           ring_id, wait_port):
+    def _init_communicator(self,
+                           program,
+                           current_endpoint,
+                           endpoints,
+                           rank,
+                           ring_id,
+                           wait_port,
+                           global_ring_id=None,
+                           sync=True):
         nranks = len(endpoints)
         other_endpoints = endpoints[:]
         other_endpoints.remove(current_endpoint)
         if rank == 0 and wait_port:
             wait_server_ready(other_endpoints)
 
+        def _add_sync_by_allreduce(block):
+            sync_var = block.create_var(
+                name=unique_name.generate('sync_var'),
+                dtype=core.VarDesc.VarType.INT32,
+                persistable=False,
+                stop_gradient=True)
+            block.append_op(
+                type='fill_constant',
+                inputs={},
+                outputs={'Out': [sync_var]},
+                attrs={
+                    'shape': [1],
+                    'dtype': sync_var.dtype,
+                    'value': 1,
+                    'force_cpu': False,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+            block.append_op(
+                type='c_allreduce_sum',
+                inputs={'X': [sync_var]},
+                outputs={'Out': [sync_var]},
+                attrs={
+                    'ring_id': global_ring_id,
+                    'use_calc_stream': True,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+
         block = program.global_block()
         if core.is_compiled_with_cuda():
             comm_id_var = block.create_var(
@@ -128,6 +162,7 @@ class CollectiveHelper(object):
             raise ValueError(
                 "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu."
             )
+        if sync: _add_sync_by_allreduce(block)
 
     def _wait(self, current_endpoint, endpoints):
         assert (self.wait_port)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index 9535c9ef53..b529e04fb4 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -19,130 +19,25 @@ from paddle.fluid import core, unique_name
 from ..base.private_helper_function import wait_server_ready
 from paddle.fluid.optimizer import PipelineOptimizer as PO
 from .meta_optimizer_base import MetaOptimizerBase
-from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
-
-
-def _get_node_num(endpoints):
-    ss = set()
-    for ep in endpoints:
-        ip = ep.split(":")[0].strip()
-        if ip not in ss:
-            ss.add(ip)
-    return len(ss)
-
-
-class PipelineHelper(object):
-    def __init__(self, role_maker, wait_port='6174'):
-        self.wait_port = wait_port
-        self.role_maker = role_maker
-
-    def update_startup_program(self,
-                               startup_program=None,
-                               inner_parallelism=None):
-        self.startup_program = startup_program
-
-        nranks = self.role_maker._worker_num()
-        rank = self.role_maker._worker_index()
-        endpoints = self.role_maker._get_trainer_endpoints()
-        current_endpoint = endpoints[rank]
-        node_num = _get_node_num(endpoints)
-        assert nranks % node_num == 0
-
-        # Create ring 0 for all gpus in the same pipeline
-        if inner_parallelism > 1:
-            pipeline_rank = rank % inner_parallelism
-            pipeline_id = rank // inner_parallelism
-            start_index = pipeline_id * inner_parallelism
-            pipeline_endpoints = endpoints[start_index:start_index +
-                                           inner_parallelism]
-            self._init_communicator(self.startup_program, current_endpoint,
-                                    pipeline_endpoints, pipeline_rank, 0,
-                                    self.wait_port)
-
-        pipeline_num = len(endpoints) // inner_parallelism
-        if pipeline_num == 1: return
-        # Create rings for gpus with the same pipeline id for data parallel
-        eps = []
-        pipeline_rank = rank % inner_parallelism
-        ring_id = pipeline_rank + 1
-        for i in range(pipeline_num):
-            eps.append(endpoints[i * inner_parallelism + pipeline_rank])
-        # rank in a ring of gpus with the same pipeline id for data parallel
-        dp_rank = rank // inner_parallelism
-        self._init_communicator(self.startup_program, current_endpoint, eps,
-                                dp_rank, ring_id, self.wait_port)
-        self._broadcast_params(ring_id)
-
-    def _init_communicator(self, program, current_endpoint, endpoints, rank,
-                           ring_id, wait_port):
-        nranks = len(endpoints)
-        other_endpoints = endpoints[:]
-        other_endpoints.remove(current_endpoint)
-        if rank == 0 and wait_port:
-            wait_server_ready(other_endpoints)
-
-        block = program.global_block()
-        nccl_id_var = block.create_var(
-            name=unique_name.generate('nccl_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-        block.append_op(
-            type='c_gen_nccl_id',
-            inputs={},
-            outputs={'Out': nccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints,
-                OP_ROLE_KEY: OpRole.Forward,
-            })
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': nccl_id_var},
-            outputs={},
-            attrs={
-                'nranks': nranks,
-                'rank': rank,
-                'ring_id': ring_id,
-                OP_ROLE_KEY: OpRole.Forward,
-            })
-
-    def _broadcast_params(self, ring_id):
-        block = self.startup_program.global_block()
-        for var_name in block.vars:
-            if "nccl_id" in var_name: continue
-            param = block.var(var_name)
-            if not param.persistable:
-                continue
-
-            block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': 0,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
-
-        block.append_op(
-            type='c_sync_comm_stream',
-            inputs={'X': param},
-            outputs={'Out': param},
-            attrs={'ring_id': ring_id,
-                   OP_ROLE_KEY: OpRole.Forward})
+from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op
 
 
 class PipelineOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(PipelineOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
+<<<<<<< HEAD
         # we do not allow meta optimizer to be inner optimizer currently
+=======
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
         self.meta_optimizers_white_list = [
             "RecomputeOptimizer",
             "AMPOptimizer",
         ]
         self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ]
+        self.global_ring_id = 1
+        self.dp_ring_id = 2
+        self.start_pipeline_ring_id = 20  # Just a magic number
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -165,7 +60,11 @@ class PipelineOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.pipeline = False
-        dist_strategy.pipeline_configs = {}
+        dist_strategy.pipeline_configs = {
+            "micro_batch_size": 1,
+            "accumulate_steps": 1,
+            "schedule_mode": "1F1B",
+        }
 
     def _enable_strategy(self, dist_strategy, context):
         dist_strategy.pipeline = True
@@ -175,61 +74,143 @@ class PipelineOptimizer(MetaOptimizerBase):
             "schedule_mode": "1F1B",
         }
 
+    def _broadcast_params(self, ring_id):
+        block = self.startup_program.global_block()
+        param = None
+        for param in block.iter_parameters():
+            if param.is_distributed:
+                continue
+
+            block.append_op(
+                type='c_broadcast',
+                inputs={'X': param},
+                outputs={'Out': param},
+                attrs={
+                    'ring_id': ring_id,
+                    'root': 0,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+
+        if not param: return  # no parameter on this device
+        block.append_op(
+            type='c_sync_comm_stream',
+            inputs={'X': param},
+            outputs={'Out': param},
+            attrs={'ring_id': ring_id,
+                   OP_ROLE_KEY: OpRole.Forward})
+
+    def _get_process_group_info(self):
+        # global ring info
+        self.global_endpoints = self.endpoints
+        self.global_rank = self.rank
+        self.global_nranks = self.nranks
+
+        # data parallel ring info
+        if self.pipeline_num > 1:
+            self.dp_rank = self.rank // self.inner_parallelism
+            self.dp_nranks = self.nranks // self.inner_parallelism
+            start_index = self.rank % self.inner_parallelism
+            self.dp_endpoints = [
+                self.endpoints[start_index + i * self.inner_parallelism]
+                for i in range(self.pipeline_num)
+            ]
+
+    def _init_process_group(self, pipeline_pair, pipeline_ring_map):
+        self._get_process_group_info()
+        collective_helper = CollectiveHelper(self.role_maker, wait_port=False)
+        # Create global ring for all gpus (ring_id = 0)
+        collective_helper._init_communicator(
+            self.startup_program, self.current_endpoint, self.global_endpoints,
+            self.global_rank, self.global_ring_id, True, self.global_ring_id,
+            True)
+        # Create pipeline rings
+        if self.inner_parallelism > 1:
+            pipeline_id = self.rank // self.inner_parallelism
+            start_index = pipeline_id * self.inner_parallelism
+            for pair in pipeline_pair:
+                pair_key = pair[0] * 1000 + pair[1]
+                ring_id = pipeline_ring_map[pair_key]
+                assert ring_id >= self.start_pipeline_ring_id
+                first_node = pair[0] + start_index
+                second_node = pair[1] + start_index
+                if self.rank != first_node and self.rank != second_node:
+                    continue
+                pipeline_endpoints = [
+                    self.endpoints[first_node], self.endpoints[second_node]
+                ]
+                pipeline_rank = 0 if self.rank == first_node else 1
+                pipeline_nranks = 2
+                collective_helper._init_communicator(
+                    self.startup_program, self.current_endpoint,
+                    pipeline_endpoints, pipeline_rank, ring_id, False,
+                    self.global_ring_id, True)
+
+        # Create dp rings
+        if self.pipeline_num > 1:
+            collective_helper._init_communicator(
+                self.startup_program, self.current_endpoint, self.dp_endpoints,
+                self.dp_rank, self.dp_ring_id, True, self.global_ring_id, True)
+            self._broadcast_params(self.dp_ring_id)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        endpoints = self.role_maker._get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker._worker_index()]
-        self.wrapped_opt = PO(self.inner_opt,
-                              num_microbatches=self.num_microbatches)
-        node_num = _get_node_num(endpoints)
-        gpus_per_node = len(endpoints) // node_num
-        self.startup_program = startup_program
-        if startup_program is None:
-            self.startup_program = fluid.default_startup_program()
-
+        self.endpoints = self.role_maker._get_trainer_endpoints()
+        self.current_endpoint = self.endpoints[self.role_maker._worker_index()]
         self.rank = self.role_maker._worker_index()
         self.nranks = self.role_maker._worker_num()
-        assert self.nranks % node_num == 0
 
+<<<<<<< HEAD
         loss.block.program._pipeline_opt = dict()
         loss.block.program._pipeline_opt['local_rank'] = self.rank
         loss.block.program._pipeline_opt[
             'micro_batch_size'] = self.micro_batch_size
         loss.block.program._pipeline_opt['schedule_mode'] = self.schedule_mode
         optimize_ops, params_grads, prog_list = self.wrapped_opt.minimize(
+=======
+        self.wrapped_opt = PO(self.inner_opt,
+                              num_microbatches=self.num_microbatches)
+        orig_startup_program = startup_program if startup_program else fluid.default_startup_program(
+        )
+        block = loss.block
+        program = block.program
+
+        program._pipeline_opt = dict()
+        program._pipeline_opt['local_rank'] = self.rank
+        program._pipeline_opt['global_ring_id'] = self.global_ring_id
+        program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id
+        program._pipeline_opt['micro_batch_size'] = self.micro_batch_size
+        program._pipeline_opt['schedule_mode'] = self.schedule_mode
+        optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize(
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
             loss, startup_program, parameter_list, no_grad_set)
-        assert prog_list
-
-        self.main_program_list = prog_list
-        self.main_program = loss.block.program
-        self.inner_parallelism = loss.block.program._pipeline_opt[
-            'inner_parallelism']
+        self.startup_program = orig_startup_program._pipeline_opt[
+            'startup_program']
+        self.inner_parallelism = program._pipeline_opt['inner_parallelism']
         assert self.nranks % self.inner_parallelism == 0
+        assert prog_list
+        self.pipeline_num = len(self.endpoints) // self.inner_parallelism
 
-        pipeline_helper = PipelineHelper(self.role_maker)
-        pipeline_helper.update_startup_program(
-            self.startup_program._pipeline_opt["startup_program"],
-            self.inner_parallelism)
+        self._init_process_group(pp_pair, ring_map)
 
-        pipeline_num = self.nranks // self.inner_parallelism
-        self._transpile_main_program(loss, pipeline_num, self.inner_parallelism)
+        self.main_program_list = prog_list
+        self.main_program = program
+        if self.pipeline_num > 1:
+            self._transpile_main_program(loss)
         return optimize_ops, params_grads
 
-    def _transpile_main_program(self, loss, pipeline_num, inner_parallelism):
-        if pipeline_num <= 1: return
-        self._insert_loss_grad_ops(loss, pipeline_num)
-        for ring_id in range(1, inner_parallelism + 1):
-            self._insert_allreduce_ops(ring_id)
+    def _transpile_main_program(self, loss):
+        self._insert_loss_grad_ops(loss, self.pipeline_num)
+        self._insert_allreduce_ops(self.dp_ring_id)
 
     def _insert_loss_grad_ops(self, loss, pipeline_num):
         """
         In order to keep the learning rate consistent in different numbers of
         training workers, we scale the loss grad by the number of workers
         """
-        block = self.main_program_list[-1]['program'].global_block()
+        block = self.main_program_list[-1].global_block()
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
@@ -244,57 +225,53 @@ class PipelineOptimizer(MetaOptimizerBase):
                     })
 
     def _insert_allreduce_ops(self, ring_id):
-        block = self.main_program_list[ring_id - 1]['program'].global_block()
+        block = self.main_program._pipeline_opt['section_program'].global_block(
+        )
         origin_block = self.main_program.global_block()
         grad = None
         processed_param_name = set()
+        first_optimize_op_idx = None
+        add_sync_calc_stream = False
         for idx, op in reversed(list(enumerate(block.ops))):
+            if is_backward_op(op) and not first_optimize_op_idx:
+                first_optimize_op_idx = idx + 1
+                # no optimize phase
+                if first_optimize_op_idx == len(block.ops): return
             if is_backward_op(op) and \
                     OP_ROLE_VAR_KEY in op.attr_names:
                 op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
                 if len(op_role_var) == 0:
                     continue
                 assert len(op_role_var) % 2 == 0
-                offset = idx
+                offset = 0
                 for i in range(0, len(op_role_var), 2):
                     param_name = op_role_var[i]
                     param = block.vars[op_role_var[i]]
                     if param_name in processed_param_name: continue
                     processed_param_name.add(param_name)
-                    grad = block.vars[op_role_var[i + 1]]
+                    grad_name = op_role_var[i + 1]
+                    if not 'MERGED' in grad_name: grad_name += '@MERGED'
+                    grad = block.vars[grad_name]
                     origin_param = origin_block.vars[op_role_var[i]]
                     if origin_param.is_distributed:
                         continue
-                    if offset == idx:
-                        offset += 1
+                    if not add_sync_calc_stream:
+                        add_sync_calc_stream = True
                         block._insert_op(
-                            offset,
+                            first_optimize_op_idx + offset,
                             type='c_sync_calc_stream',
                             inputs={'X': grad},
                             outputs={'Out': grad},
-                            attrs={OP_ROLE_KEY: OpRole.Backward})
+                            attrs={OP_ROLE_KEY: OpRole.Optimize})
                         offset += 1
 
                     block._insert_op(
-                        offset,
+                        first_optimize_op_idx + offset,
                         type='c_allreduce_sum',
                         inputs={'X': grad},
                         outputs={'Out': grad},
                         attrs={
                             'ring_id': ring_id,
-                            OP_ROLE_KEY: OpRole.Backward
+                            'use_calc_stream': True,
+                            OP_ROLE_KEY: OpRole.Optimize
                         })
-
-        if grad is None:
-            return
-
-        for idx, op in enumerate(block.ops):
-            if is_optimizer_op(op):
-                block._insert_op(
-                    idx,
-                    type='c_sync_comm_stream',
-                    inputs={'X': grad},
-                    outputs={'Out': grad},
-                    attrs={'ring_id': ring_id,
-                           OP_ROLE_KEY: OpRole.Backward})
-            break
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index b24da29d0f..ae34186878 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -230,6 +230,7 @@ def __bootstrap__():
             'gpu_allocator_retry_time',
             'local_exe_sub_scope_limit',
             'gpu_memory_limit_mb',
+            'conv2d_disable_cudnn',
         ]
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
diff --git a/python/paddle/fluid/tests/custom_op/setup_install.py b/python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py
similarity index 46%
rename from python/paddle/fluid/tests/custom_op/setup_install.py
rename to python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py
index 18fbfbaf8b..8c05bc4899 100644
--- a/python/paddle/fluid/tests/custom_op/setup_install.py
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,19 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 
-from utils import paddle_includes, extra_compile_args
-from paddle.utils.cpp_extension import CUDAExtension, setup
-from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method
+from __future__ import print_function
 
-# switch to old custom op method
-use_new_custom_op_load_method(False)
+from . import amp_lists
+from .amp_lists import *
+from . import amp_utils
+from .amp_utils import *
 
-setup(
-    name='custom_relu2',
-    ext_modules=CUDAExtension(  # test for not specific name here.
-        sources=['relu_op.cc', 'relu_op.cu', 'relu_op3.cc',
-                 'relu_op3.cu'],  # test for multi ops
-        include_dirs=paddle_includes,
-        extra_compile_args=extra_compile_args))
+__all__ = []
+__all__ += amp_lists.__all__
+__all__ += amp_utils.__all__
diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py
new file mode 100644
index 0000000000..81dc32d114
--- /dev/null
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py
@@ -0,0 +1,97 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+from ..fp16_lists import white_list as white_list_fp16, black_list as black_list_fp16,\
+    gray_list as gray_list_fp16, unsupported_fp16_list
+
+__all__ = ["AutoMixedPrecisionListsBF16"]
+
+
+class AutoMixedPrecisionListsBF16(object):
+    """
+    AutoMixedPrecisionListsBF16 is a class for fp32/bf16 op types list. The lists are used for an
+    algorithm which determines op's execution mode (fp32 or bf16).It can update pre-defined
+    fp32 list and bf16 list according to users' custom fp32 bf16 lists.
+
+    Args:
+        custom_bf16_list (set): Users' custom bf16 list.
+        custom_fp32_list (set): Users' custom fp32 list.
+        custom_fp32_varnames (set): Users' custom fp32 variables' names.
+
+    Examples:
+        .. code-block:: python
+        import paddle
+        paddle.enable_static()
+        with paddle.static.amp.bf16_guard():
+            paddle.static.amp.AutoMixedPrecisionListsBF16(custom_fp32_list={'lstm'})
+    """
+
+    def __init__(self,
+                 custom_bf16_list=None,
+                 custom_fp32_list=None,
+                 custom_fp32_varnames=None):
+        self._custom_bf16_list = custom_bf16_list
+        self._custom_fp32_list = custom_fp32_list
+        self.bf16_list = copy.copy(bf16_list)
+        self.fp32_list = copy.copy(fp32_list)
+        self.gray_list = copy.copy(gray_list)
+        self.unsupported_list = copy.copy(unsupported_list)
+        self.fp32_varnames = copy.copy(custom_fp32_varnames)
+        self._update_list()
+
+    def _update_list(self):
+        """
+        Update fp32 and bf16 list according to users' custom list.
+        """
+        if self._custom_bf16_list and self._custom_fp32_list:
+            for op_name in self._custom_bf16_list:
+                if op_name in self._custom_fp32_list:
+                    raise ValueError("Custom bf16 list overlap "
+                                     "custom fp32 list")
+        if self._custom_bf16_list:
+            for op_name in self._custom_bf16_list:
+                if op_name in self.fp32_list:
+                    self.fp32_list.remove(op_name)
+                elif op_name in self.gray_list:
+                    self.gray_list.remove(op_name)
+                self.bf16_list.add(op_name)
+        if self._custom_fp32_list:
+            for op_name in self._custom_fp32_list:
+                if op_name in self.bf16_list:
+                    self.bf16_list.remove(op_name)
+                elif op_name in self.gray_list:
+                    self.gray_list.remove(op_name)
+                self.fp32_list.add(op_name)
+                self.unsupported_list.add(op_name)
+
+
+# always bf16
+bf16_list = {'elementwise_add', }
+
+# depends on the prev_op type
+gray_list = {
+    'reshape2',
+    'lookup_table',
+}
+
+unsupported_list = unsupported_fp16_list.copy().copy()
+fp32_list = black_list_fp16.copy().copy()
+fp32_list |= white_list_fp16
+fp32_list |= gray_list_fp16
+
+fp32_list -= bf16_list
+fp32_list -= gray_list
+unsupported_list -= bf16_list
+unsupported_list -= gray_list
diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py
new file mode 100644
index 0000000000..c2c01f88c7
--- /dev/null
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py
@@ -0,0 +1,296 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import struct
+
+from .... import core
+from .... import framework
+from ....log_helper import get_logger
+from ....wrapped_decorator import signature_safe_contextmanager
+from .amp_lists import AutoMixedPrecisionListsBF16
+from ..fp16_utils import find_true_prev_op, find_true_post_op, _rename_arg, find_op_index
+import logging
+import numpy as np
+
+__all__ = ["bf16_guard", "rewrite_program_bf16", "convert_float_to_uint16"]
+
+_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+
+_valid_types = [
+    core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.SELECTED_ROWS,
+    core.VarDesc.VarType.LOD_TENSOR_ARRAY
+]
+
+_bf16_guard_pattern = "__use_bf16__"
+
+
+def convert_float_to_uint16(in_list):
+    in_list = np.asarray(in_list)
+    out = np.vectorize(
+        lambda x: struct.unpack('<I', struct.pack('<f', x))[0] >> 16,
+        otypes=[np.uint16])(in_list.flat)
+    return np.reshape(out, in_list.shape)
+
+
+def _dtype_to_str(dtype):
+    """
+    Convert specific variable type to its corresponding string.
+
+    Args:
+        dtype (VarType): Variable type.
+    """
+    if dtype == core.VarDesc.VarType.BF16:
+        return 'bf16'
+    else:
+        return 'fp32'
+
+
+def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
+    """
+    Insert cast op and rename args of input and output.
+
+    Args:
+        block (Program): The block in which the operator is.
+        op (Operator): The operator to insert cast op.
+        idx (int): The index of current operator.
+        src_dtype (VarType): The input variable dtype of cast op.
+        dest_dtype (VarType): The output variable dtype of cast op.
+
+    Returns:
+        num_cast_op (int): The number of cast ops that have been inserted.
+    """
+    num_cast_ops = 0
+
+    for in_name in op.input_names:
+        if src_dtype == core.VarDesc.VarType.FP32 and op.type in [
+                'batch_norm', 'fused_bn_add_activation', 'layer_norm'
+        ]:
+            if in_name not in {'X', 'Z'}:
+                continue
+        for in_var_name in op.input(in_name):
+            in_var = block.var(in_var_name)
+            if in_var.type not in _valid_types or in_var.dtype == dest_dtype:
+                continue
+            if in_var.dtype == src_dtype:
+                cast_name = in_var.name + '.cast_' + _dtype_to_str(dest_dtype)
+                out_var = block.vars.get(cast_name)
+                if out_var is None or out_var.dtype != dest_dtype:
+                    out_var = block.create_var(
+                        name=cast_name,
+                        dtype=dest_dtype,
+                        persistable=False,
+                        stop_gradient=in_var.stop_gradient)
+
+                    block._insert_op(
+                        idx,
+                        type="cast",
+                        inputs={"X": in_var},
+                        outputs={"Out": out_var},
+                        attrs={
+                            "in_dtype": in_var.dtype,
+                            "out_dtype": out_var.dtype
+                        })
+                    num_cast_ops += 1
+                _rename_arg(op, in_var.name, out_var.name)
+            else:
+                if op.has_attr('in_dtype'):
+                    op._set_attr('in_dtype', dest_dtype)
+    if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.BF16:
+        for out_name in op.output_names:
+            if op.type in [
+                    'batch_norm', 'fused_bn_add_activation', 'layer_norm'
+            ] and out_name != 'Y':
+                continue
+            for out_var_name in op.output(out_name):
+                out_var = block.var(out_var_name)
+                if out_var.type not in _valid_types:
+                    continue
+                if out_var.dtype == core.VarDesc.VarType.FP32:
+                    out_var.desc.set_dtype(core.VarDesc.VarType.BF16)
+                    if op.has_attr('out_dtype'):
+                        op._set_attr('out_dtype', core.VarDesc.VarType.BF16)
+    return num_cast_ops
+
+
+def _is_in_fp32_varnames(op, amp_lists):
+    for in_name in op.input_arg_names:
+        if in_name in amp_lists.fp32_varnames:
+            return True
+
+    for out_name in op.output_arg_names:
+        if out_name in amp_lists.fp32_varnames:
+            return True
+
+    return False
+
+
+def _need_keep_fp32(op, unsupported_op_list, use_bf16_guard):
+    if op.type in unsupported_op_list:
+        # the highest priority condition: If ops don't have bf16 computing kernels,
+        # they must be executed in fp32 calculation pattern.
+        return True
+
+    # process ops about learning rate
+    in_out_arg_names = []
+    in_out_arg_names.extend(list(op.input_arg_names))
+    in_out_arg_names.extend(list(op.output_arg_names))
+    for name in in_out_arg_names:
+        if "learning_rate" in name:
+            return True
+
+    if use_bf16_guard:
+        if op.has_attr("op_namescope") and \
+                (_bf16_guard_pattern in op.attr("op_namescope")):
+            # op in bf16 guard
+            return False
+        else:
+            # op not in bf16 guard
+            return True
+    else:
+        return False
+
+
+@signature_safe_contextmanager
+def bf16_guard():
+    """
+    As for the pure bf16 training, if users set `use_bf16_guard` to True,
+    only those ops created in the context manager `bf16_guard` will be
+    transformed as float16 type.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+            paddle.enable_static()
+            data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
+            conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
+
+            with paddle.static.amp.bf16_guard():
+                bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
+                pool = F.max_pool2d(bn, kernel_size=2, stride=2)
+                hidden = paddle.static.nn.fc(pool, size=10)
+                loss = paddle.mean(hidden)
+    """
+    with framework.name_scope(prefix=_bf16_guard_pattern):
+        yield
+
+
+def rewrite_program_bf16(main_prog, amp_lists=None, use_bf16_guard=False):
+    """
+    Traverse all ops in current block and insert cast op according to
+    which set current op belongs to.
+
+    1. When an op belongs to the fp32 list, add it to fp32 set
+    2. When an op belongs to the bf16 list, add it to bf16 set
+    3. When an op belongs to the gray list. If one
+       of its inputs is the output of fp32 set op or fp32 list op,
+       add it to fp32 set. If all of its previous ops are not fp32
+       op and one of its inputs is the output of bf16 set op or
+       bf16 list op, add it to bf16 set.
+    4. When an op isn't in the lists, add it to fp32 op set.
+    5. Add necessary cast ops to make sure that fp32 set op will be
+       computed in fp32 mode, while bf16 set op will be computed in
+       bf16 mode.
+
+    Args:
+        main_prog (Program): The main program for training.
+    """
+    if amp_lists is None:
+        amp_lists = AutoMixedPrecisionListsBF16()
+    block = main_prog.global_block()
+    ops = block.ops
+    bf16_op_set = set()
+    fp32_op_set = set()
+    for op in ops:
+
+        # NOTE(zhiqiu): 'create_py_reader' and 'read' is used in non-iterable DataLoder,
+        # we don't need to handle reader op and the input of 'create_py_reader' is not
+        # in block, which may result in errors.
+        # See GeneratorLoader._init_non_iterable() for details.
+        if op.type == 'create_py_reader' or op.type == 'read':
+            continue
+
+        if amp_lists.fp32_varnames is not None and _is_in_fp32_varnames(
+                op, amp_lists):
+            fp32_op_set.add(op)
+            continue
+
+        if op.type in amp_lists.fp32_list or _need_keep_fp32(
+                op, amp_lists.unsupported_list, use_bf16_guard):
+            fp32_op_set.add(op)
+        elif op.type in amp_lists.bf16_list:
+            bf16_op_set.add(op)
+        elif op.type in amp_lists.gray_list:
+            is_fp32_op = False
+            is_bf16_op = False
+            for in_name in op.input_names:
+                # if this op has inputs
+                if in_name:
+                    for in_var_name in op.input(in_name):
+                        in_var = block.var(in_var_name)
+                        # this in_var isn't the output of other op
+                        if in_var.op is None:
+                            continue
+                        elif in_var.op is op:
+                            prev_op = find_true_prev_op(ops, op, in_var_name)
+                            if prev_op is None:
+                                continue
+                        else:
+                            prev_op = in_var.op
+                        # if it's one of inputs
+                        if prev_op in fp32_op_set or \
+                                prev_op.type in amp_lists.fp32_list:
+                            is_fp32_op = True
+                        elif prev_op in bf16_op_set or \
+                                prev_op.type in amp_lists.bf16_list:
+                            is_bf16_op = True
+            if is_fp32_op:
+                fp32_op_set.add(op)
+            elif is_bf16_op:
+                bf16_op_set.add(op)
+            else:
+                pass
+        else:
+            # For numerical safe, we apply fp32 computation on ops that
+            # are not determined which list they should stay.
+            fp32_op_set.add(op)
+
+    idx = 0
+    while idx < len(ops):
+        op = ops[idx]
+        num_cast_ops = 0
+        if op in fp32_op_set:
+            num_cast_ops = _insert_cast_op(block, op, idx,
+                                           core.VarDesc.VarType.BF16,
+                                           core.VarDesc.VarType.FP32)
+        elif op in bf16_op_set:
+            if op.has_attr('use_mkldnn'):
+                op._set_attr('use_mkldnn', True)
+                op._set_attr('mkldnn_data_type', 'bfloat16')
+            elif op.has_attr('dtype') and op.attr(
+                    'dtype') == core.VarDesc.VarType.FP32:
+                op._set_attr('dtype', core.VarDesc.VarType.BF16)
+
+            num_cast_ops = _insert_cast_op(block, op, idx,
+                                           core.VarDesc.VarType.FP32,
+                                           core.VarDesc.VarType.BF16)
+        else:
+            pass
+
+        idx += num_cast_ops + 1
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index f9c3a613c4..67e83a2ec4 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -123,7 +123,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
                         outputs={"Out": out_var},
                         attrs={
                             "in_dtype": in_var.dtype,
-                            "out_dtype": out_var.dtype
+                            "out_dtype": out_var.dtype,
+                            "op_device": op.attr("op_device")
                         })
                     num_cast_ops += 1
                 _rename_arg(op, in_var.name, out_var.name)
@@ -171,8 +172,11 @@ def _insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name,
             type="cast",
             inputs={"X": target_var},
             outputs={"Out": cast_var},
-            attrs={"in_dtype": target_var.dtype,
-                   "out_dtype": cast_var.dtype})
+            attrs={
+                "in_dtype": target_var.dtype,
+                "out_dtype": cast_var.dtype,
+                "op_device": op.attr("op_device")
+            })
         num_cast_ops += 1
         op_var_rename_map[block.idx][target_var.name] = cast_var.name
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 68b4cfdc66..7f31025575 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -21,14 +21,17 @@ import warnings
 
 import paddle
 from paddle.fluid import dygraph, core, framework, unique_name
-from paddle.fluid.executor import Executor
+from paddle.fluid.executor import Executor, global_scope
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.initializer import Constant
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.fluid.io import load_inference_model, save_inference_model
 from paddle.fluid.log_helper import get_logger
-from . import quant_nn
 from .. import quantization_pass
+<<<<<<< HEAD
+=======
+from . import quant_nn
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 from . import utils
 
 __all__ = ['ImperativeQuantAware']
@@ -201,7 +204,11 @@ class ImperativeQuantAware(object):
 
         self._quantize_inputs = ImperativeQuantizeInputs(**kwargs)
 
+<<<<<<< HEAD
         self._calc_output_scale = ImperativeCalcOutputScale()
+=======
+        self._quantize_outputs = ImperativeQuantizeOutputs()
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
     def quantize(self, model):
         """
@@ -219,11 +226,19 @@ class ImperativeQuantAware(object):
         assert isinstance(model, dygraph.Layer), \
             "The model must be the instance of dygraph.Layer."
         self._quantize_inputs.apply(model)
+<<<<<<< HEAD
         self._calc_output_scale.apply(model)
 
     def save_quantized_model(self, layer, path, input_spec=None, **config):
         self._calc_output_scale.save_quantized_model(layer, path, input_spec,
                                                      **config)
+=======
+        self._quantize_outputs.apply(model)
+
+    def save_quantized_model(self, layer, path, input_spec=None, **config):
+        self._quantize_outputs.save_quantized_model(layer, path, input_spec,
+                                                    **config)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
 
 class ImperativeQuantizeInputs(object):
@@ -251,8 +266,13 @@ class ImperativeQuantizeInputs(object):
         super(ImperativeQuantizeInputs, self).__init__()
 
         self._quantizable_layer_type = tuple(
+<<<<<<< HEAD
             utils.supported_quant_layers_map[layer]
             if layer in utils.supported_quant_layers_map else layer
+=======
+            utils.quant_input_layers_map[layer]
+            if layer in utils.quant_input_layers_map else layer
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
             for layer in quantizable_layer_type)
         for layer in self._quantizable_layer_type:
             assert not isinstance(layer, str), \
@@ -323,6 +343,7 @@ class ImperativeQuantizeInputs(object):
                 idx += 1
             target = name[last_idx:idx]
 
+<<<<<<< HEAD
             quant_layer = self._get_quantized_layer(layer)
             setattr(quant_layer, "layer_name", layer.full_name())
             setattr(obj, target, quant_layer)
@@ -330,6 +351,14 @@ class ImperativeQuantizeInputs(object):
     def _get_quantized_layer(self, layer):
         quant_layer_name = None
         for key, value in utils.supported_quant_layers_map.items():
+=======
+            quant_layer = self._get_input_quantized_layer(layer)
+            setattr(obj, target, quant_layer)
+
+    def _get_input_quantized_layer(self, layer):
+        quant_layer_name = None
+        for key, value in utils.quant_input_layers_map.items():
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
             if isinstance(layer, value):
                 quant_layer_name = 'Quantized' + key
                 break
@@ -340,19 +369,36 @@ class ImperativeQuantizeInputs(object):
         layer_with_weight = ['QuantizedConv2D', 'QuantizedLinear']
         if quant_layer_name not in layer_with_weight:
             quant_layer_name = 'QuantizedNoweightLayer'
+<<<<<<< HEAD
 
         return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs)
+=======
 
+        return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs)
+
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
+class ImperativeQuantizeOutputs(object):
+    """
+    Calculate the output scales for some layers.
+    """
+
+<<<<<<< HEAD
 class ImperativeCalcOutputScale(object):
     def __init__(self, moving_rate=0.9):
         """
         Add the logic of calculating and setting output scales of some layers. 
+=======
+    def __init__(self, moving_rate=0.9):
+        """
+        The constructor for ImperativeQuantizeOutputs.
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
         Args:
             moving_rate(float): The decay coefficient of moving average.
                                 The default value is 0.9.
         """
+<<<<<<< HEAD
         super(ImperativeCalcOutputScale, self).__init__()
         self._moving_rate = moving_rate
         self._register_hook_handle_list = []
@@ -362,6 +408,15 @@ class ImperativeCalcOutputScale(object):
         """
         Insert the `moving_average_abs_max_scale` op to calculate output 
         scale of specific layers in model.
+=======
+        super(ImperativeQuantizeOutputs, self).__init__()
+        self._moving_rate = moving_rate
+
+    def apply(self, model):
+        """
+        Insert the `moving_average_abs_max_scale` layers to calculate the
+        output scales for specific layers in the dygraph model.
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
         Args:
             model(fluid.dygraph.Layer): The target model which would be
@@ -372,12 +427,35 @@ class ImperativeCalcOutputScale(object):
         """
         assert isinstance(model, dygraph.Layer), \
             "The model must be the instance of dygraph.Layer."
+<<<<<<< HEAD
         for _, layer in model.named_sublayers():
             if self._is_target_layer(layer):
                 self._init_scale_params(layer)
                 hook_handle = layer.register_forward_post_hook(
                     self._calc_output_scale_hook)
                 self._register_hook_handle_list.append(hook_handle)
+=======
+
+        for name, layer in model.named_sublayers():
+            if not self._is_target_layer(layer):
+                continue
+
+            # TODO(jc): optimize this module
+            last_idx = 0
+            idx = 0
+            obj = model
+            while idx < len(name):
+                if (name[idx] == '.'):
+                    if hasattr(obj, name[last_idx:idx]):
+                        obj = getattr(obj, name[last_idx:idx])
+                        last_idx = idx + 1
+                idx += 1
+            target = name[last_idx:idx]
+
+            quant_layer = quant_nn.__dict__["QuantizedOutputLayer"](
+                layer, self._moving_rate)
+            setattr(obj, target, quant_layer)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
     def save_quantized_model(self, layer, path, input_spec=None, **config):
         """
@@ -407,7 +485,10 @@ class ImperativeCalcOutputScale(object):
         Returns:
             None
         """
+        assert isinstance(layer, dygraph.Layer), \
+            "The model must be the instance of dygraph.Layer."
 
+<<<<<<< HEAD
         assert isinstance(layer, dygraph.Layer), \
             "The model must be the instance of dygraph.Layer."
 
@@ -430,26 +511,41 @@ class ImperativeCalcOutputScale(object):
         paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config)
 
         # load static model
+=======
+        paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config)
+
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
         is_dynamic_mode = False
         if paddle.in_dynamic_mode():
             is_dynamic_mode = True
             paddle.enable_static()
 
+<<<<<<< HEAD
         place = core.CUDAPlace(0) if core.is_compiled_with_cuda() \
             else core.CPUPlace()
+=======
+        place = core.CPUPlace()
+        scope = global_scope()
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
         exe = Executor(place)
 
         dirname = os.path.dirname(path)
         basename = os.path.basename(path)
         model_filename = basename + INFER_MODEL_SUFFIX
         params_filename = basename + INFER_PARAMS_SUFFIX
+<<<<<<< HEAD
         [inference_program, feed_target_names, fetch_targets] = (
+=======
+
+        [infer_program, feed_target_names, fetch_targets] = (
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
             load_inference_model(
                 dirname=dirname,
                 executor=exe,
                 model_filename=model_filename,
                 params_filename=params_filename))
 
+<<<<<<< HEAD
         # set output scales to the static model
         check_behind_op = False
         op_count = 0
@@ -518,12 +614,18 @@ class ImperativeCalcOutputScale(object):
             self._set_skip_quant_attr(inference_program)
 
         # save the final quantized model that has output scales
+=======
+        self._save_output_scale(infer_program, scope)
+
+        self._set_skip_quant_attr(infer_program)
+
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
         save_inference_model(
             dirname=dirname,
             feeded_var_names=feed_target_names,
             target_vars=fetch_targets,
             executor=exe,
-            main_program=inference_program.clone(),
+            main_program=infer_program.clone(),
             model_filename=model_filename,
             params_filename=params_filename)
 
@@ -531,6 +633,7 @@ class ImperativeCalcOutputScale(object):
             paddle.disable_static()
 
     def _is_target_layer(self, layer):
+<<<<<<< HEAD
         return isinstance(layer, utils.out_scale_layers_list) \
             or 'quantized_' in layer.full_name()
 
@@ -597,6 +700,43 @@ class ImperativeCalcOutputScale(object):
         for op in block.ops:
             if self._is_skip_quant_op(block, op):
                 op._set_attr("skip_quant", True)
+=======
+        """
+        Whether the layer needs to calculate output scales.
+        """
+        return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \
+            or ('quantized' in layer.full_name() and \
+                'quantized_noweight' not in layer.full_name())
+
+    def _save_output_scale(self, program, scope):
+        """
+        Save all output scales to the corresponding ops in static
+        inference program and delete 'moving_average_abs_max_scale' ops.
+        """
+        for block in program.blocks:
+            for op in block.ops:
+                if op.type == "moving_average_abs_max_scale":
+                    in_var_name = op.input('X')[0]
+                    out_var_name = op.output('Out')[0]
+                    out_scale_name = op.output('OutScale')[0]
+
+                    out_scale = utils.load_variable_data(scope, out_scale_name)
+                    previous_op = utils.find_previous_op(block, in_var_name)
+                    previous_op._set_attr("out_threshold", float(out_scale))
+
+                    next_ops = utils.find_next_ops(block, out_var_name)
+                    for next_op in next_ops:
+                        next_op._rename_input(out_var_name, in_var_name)
+
+    def _set_skip_quant_attr(self, program):
+        """
+        Label the skip quantized ops.
+        """
+        for block in program.blocks:
+            for op in block.ops:
+                if self._is_skip_quant_op(block, op):
+                    op._set_attr("skip_quant", True)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
     def _is_skip_quant_op(self, block, in_op):
         """
@@ -604,16 +744,20 @@ class ImperativeCalcOutputScale(object):
         1. the type of input op should be conv2d, depthwise_conv2d or matmul
         2. the previous ops of the input op are not fake_quantize_dequantize ops
         """
+<<<<<<< HEAD
 
         def _find_previous_op(block, var_name):
             for op in block.ops:
                 if var_name in op.output_arg_names:
                     return op
 
+=======
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
         target_op_types = ["conv2d", "depthwise_conv2d", "matmul"]
         if in_op.type not in target_op_types:
             return False
 
+<<<<<<< HEAD
         previous_ops = [_find_previous_op(block, arg_name) \
             for arg_name in in_op.input_arg_names]
         return any(op is not None and op.type not in utils.fake_quantize_dequantize_types \
@@ -634,3 +778,9 @@ class ImperativeCalcOutputScale(object):
                     layer, output.name, self._moving_rate, output.dtype)
             # TODO (jc): consider the ops that have several outputs 
             self._out_scale(output)
+=======
+        previous_ops = [utils.find_previous_op(block, arg_name) \
+            for arg_name in in_op.input_arg_names]
+        return any(op is not None and op.type not in \
+            utils.fake_quantize_dequantize_types for op in previous_ops)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
index 3c4fb323bc..f6fef0689d 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
@@ -507,59 +507,42 @@ class QuantizedNoweightLayer(layers.Layer):
 
 
 class MovingAverageAbsMaxScale(layers.Layer):
-    def __init__(self, layer=None, name=None, moving_rate=0.9, dtype='float32'):
+    def __init__(self, name=None, moving_rate=0.9, dtype='float32'):
         r"""
-        MovingAverageMaxScale layer is used to calculating the output quantization scale of Layer.
-        Its computational formula is described as below:
+        MovingAverageMaxScale layer is used to calculating the output quantization
+        scale of Layer. Its computational formula is described as below:
 
         :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)`
         :math:`Out = X`
         """
         super(MovingAverageAbsMaxScale, self).__init__()
         self._moving_rate = moving_rate
-        self._dtype = dtype
-        self._layer = layer
 
-        if self._layer is None or not hasattr(self._layer, "_quant_out_scale"):
-            scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
-            scale_name = unique_name.generate(scale_prefix)
-            scale_attr = ParamAttr(
-                name=scale_name, initializer=Constant(1), trainable=False)
-            self._scale = self.create_parameter(
-                shape=[1], attr=scale_attr, dtype=self._dtype)
-            self._scale.stop_gradient = True
-            if self._layer is not None:
-                setattr(self._layer, "_quant_out_scale", self._scale)
-        else:
-            self._scale = self._layer._quant_out_scale
+        scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
+        scale_name = unique_name.generate(scale_prefix)
+        scale_attr = ParamAttr(
+            name=scale_name, initializer=Constant(1), trainable=False)
+        self._scale = self.create_parameter(
+            shape=[1], attr=scale_attr, dtype=dtype)
+        self._scale.stop_gradient = True
 
-        if self._layer is None or not hasattr(self._layer, "_quant_out_state"):
-            state_prefix = "{}.state".format(name) if name else 'outscale.state'
-            state_attr = ParamAttr(
-                name=unique_name.generate(state_prefix),
-                initializer=Constant(1),
-                trainable=False)
-            self._state = self.create_parameter(
-                shape=[1], attr=state_attr, dtype=self._dtype)
-            self._state.stop_gradient = True
-            if self._layer is not None:
-                setattr(self._layer, "_quant_out_state", self._state)
-        else:
-            self._state = self._layer._quant_out_state
+        state_prefix = "{}.state".format(name) if name else 'outscale.state'
+        state_attr = ParamAttr(
+            name=unique_name.generate(state_prefix),
+            initializer=Constant(1),
+            trainable=False)
+        self._state = self.create_parameter(
+            shape=[1], attr=state_attr, dtype=dtype)
+        self._state.stop_gradient = True
 
-        if self._layer is None or not hasattr(self._layer, "_quant_out_accum"):
-            accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
-            accum_attr = ParamAttr(
-                name=unique_name.generate(accum_prefix),
-                initializer=Constant(1),
-                trainable=False)
-            self._accum = self.create_parameter(
-                shape=[1], attr=accum_attr, dtype=self._dtype)
-            self._accum.stop_gradient = True
-            if self._layer is not None:
-                setattr(self._layer, "_quant_out_accum", self._accum)
-        else:
-            self._accum = self._layer._quant_out_accum
+        accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
+        accum_attr = ParamAttr(
+            name=unique_name.generate(accum_prefix),
+            initializer=Constant(1),
+            trainable=False)
+        self._accum = self.create_parameter(
+            shape=[1], attr=accum_attr, dtype=dtype)
+        self._accum.stop_gradient = True
 
     def forward(self, input):
         if in_dygraph_mode():
@@ -567,18 +550,30 @@ class MovingAverageAbsMaxScale(layers.Layer):
                      not self.training)
             state = self._state if self.training else None
             accum = self._accum if self.training else None
+            quant_out = _varbase_creator(
+                type=input.type,
+                name="{}.tmp".format(input.name),
+                shape=input.shape,
+                dtype=input.dtype,
+                persistable=False)
 
-            self._scale, _, _ = core.ops.moving_average_abs_max_scale(
-                input, accum, state, self._scale, state, accum, *attrs)
-            return self._scale
+            out, _, _, _ = core.ops.moving_average_abs_max_scale(
+                input, accum, state, quant_out, self._scale, state, accum,
+                *attrs)
+            return out
 
         check_variable_and_dtype(input, 'input', ['float32', 'float64'],
                                  'MovingAverageAbsMaxScale')
 
         attrs = {'moving_rate': self._moving_rate, 'is_test': not self.training}
-
         inputs = {"X": [input]}
-        outputs = {"OutScale": [self._scale]}
+        quant_out = self._helper.create_variable(
+            name="{}.tmp".format(input.name),
+            dtype=input.dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            persistable=False,
+            stop_gradient=False)
+        outputs = {"Out": [quant_out], "OutScale": [self._scale]}
 
         if self.training:
             inputs['InState'] = [self._state]
@@ -592,4 +587,22 @@ class MovingAverageAbsMaxScale(layers.Layer):
             outputs=outputs,
             attrs=attrs)
 
-        return self._scale
+        return quant_out
+
+
+class QuantizedOutputLayer(layers.Layer):
+    def __init__(self, layer=None, moving_rate=0.9, dtype='float32'):
+        r"""
+        Add MovingAverageMaxScale layer to the behind of the input layer.
+        """
+        super(QuantizedOutputLayer, self).__init__()
+        self._layer = layer
+        self._moving_average_abs_max_scale = \
+            MovingAverageAbsMaxScale(layer.full_name(), moving_rate, dtype)
+
+    def forward(self, input):
+        if isinstance(input, list):
+            assert len(input) == 1, \
+                "The QuantizedOutputLayer should only have one input."
+        out = self._layer(input)
+        return self._moving_average_abs_max_scale(out)
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index 3bf655265c..87ff6ae0af 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle
+<<<<<<< HEAD
 
 op_real_in_out_name = {
     "conv2d": [["Input", "Filter"], ["Output"]],
@@ -31,6 +32,11 @@ op_real_in_out_name = {
 }
 
 supported_quant_layers_map = {
+=======
+import numpy as np
+
+quant_input_layers_map = {
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
     'Conv2D': paddle.nn.Conv2D,
     'Linear': paddle.nn.Linear,
     'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D,
@@ -58,8 +64,68 @@ fake_quantize_dequantize_types = [
     "fake_quantize_dequantize_moving_average_abs_max"
 ]
 
+<<<<<<< HEAD
 out_scale_layers_list = (
     paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.MaxPool2D,
     paddle.nn.BatchNorm, paddle.nn.BatchNorm2D, paddle.nn.SyncBatchNorm,
     paddle.nn.LeakyReLU, paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6,
     paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Tanh, paddle.nn.Swish)
+=======
+quant_output_layers_map = {
+    'Conv2D': paddle.nn.Conv2D,
+    'Conv2DTranspose': paddle.nn.Conv2DTranspose,
+    'Linear': paddle.nn.Linear,
+    'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D,
+    'AdaptiveMaxPool2D': paddle.nn.AdaptiveMaxPool2D,
+    'AvgPool2D': paddle.nn.AvgPool2D,
+    'MaxPool2D': paddle.nn.MaxPool2D,
+    'BatchNorm': paddle.nn.BatchNorm,
+    'BatchNorm2D': paddle.nn.BatchNorm2D,
+    'SyncBatchNorm': paddle.nn.SyncBatchNorm,
+    'ELU': paddle.nn.ELU,
+    'GELU': paddle.nn.GELU,
+    'LeakyReLU': paddle.nn.LeakyReLU,
+    'PReLU': paddle.nn.PReLU,
+    'ReLU': paddle.nn.ReLU,
+    'ReLU6': paddle.nn.ReLU6,
+    'Sigmoid': paddle.nn.Sigmoid,
+    'Softmax': paddle.nn.Softmax,
+    'Tanh': paddle.nn.Tanh,
+    'Swish': paddle.nn.Swish,
+}
+
+weight_op_types = [
+    "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose",
+    "depthwise_conv2d_transpose"
+]
+
+
+def load_variable_data(scope, var_name):
+    '''
+    Load variable value from scope
+    '''
+    var_node = scope.find_var(var_name)
+    assert var_node is not None, \
+        "Can not find " + var_name + " in the scope."
+    return np.array(var_node.get_tensor())
+
+
+def find_previous_op(block, var_name):
+    """
+    Find the previous op for the input variable.
+    """
+    for op in block.ops:
+        if var_name in op.output_arg_names:
+            return op
+
+
+def find_next_ops(block, var_name):
+    """
+    Find all followed ops for the input variable.
+    """
+    res_ops = []
+    for op in block.ops:
+        if var_name in op.input_arg_names:
+            res_ops.append(op)
+    return res_ops
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
index ed29375d22..b48c9ef7d8 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
@@ -33,7 +33,6 @@ from paddle.fluid.dygraph.container import Sequential
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, PReLU
 from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D
-from paddle.fluid.dygraph.nn import Pool2D
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.dygraph import nn
 
@@ -131,8 +130,8 @@ class ImperativeLenet(fluid.dygraph.Layer):
                 bias_attr=False),
             BatchNorm2D(6),
             ReLU(),
-            Pool2D(
-                pool_size=2, pool_type='max', pool_stride=2),
+            MaxPool2D(
+                kernel_size=2, stride=2),
             Conv2D(
                 in_channels=6,
                 out_channels=16,
@@ -357,7 +356,6 @@ class TestImperativeOutSclae(unittest.TestCase):
                     "diff({}) at {}, dynamic loss = {}, static loss = {}".
                     format(diff, i, loss_d, loss_s))
                 break
-
         self.assertTrue(
             np.allclose(
                 np.array(dynamic_loss_rec),
@@ -398,10 +396,19 @@ class TestImperativeOutSclae(unittest.TestCase):
             if dynamic_ops[i].has_attr("out_threshold"):
                 op_count += 1
                 self.assertTrue(dynamic_ops[i].type == static_ops[i].type)
+                if dynamic_ops[i].attr("out_threshold") != static_ops[i].attr(
+                        "out_threshold"):
+                    _logger.info(dynamic_ops[i].attr("out_threshold"))
+                    _logger.info(static_ops[i].attr("out_threshold"))
                 self.assertTrue(dynamic_ops[i].attr("out_threshold") ==
                                 static_ops[i].attr("out_threshold"))
 
+<<<<<<< HEAD
         self.assertTrue(op_count == 13)
+=======
+        _logger.info("op_cout: {}".format(op_count))
+        self.assertTrue(op_count == 14)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
 
 class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
@@ -472,28 +479,8 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
                                 static_ops[i].attr("out_threshold"))
         self.assertTrue(op_count == 13)
 
-
-class TestSaveQuantizedModel_Warning(unittest.TestCase):
-    def test_warning(self):
-        path = "./dynamic_outscale_infer_model_with_warnings/lenet"
-        imperative_out_scale = ImperativeQuantAware()
-        with fluid.dygraph.guard():
-            lenet = ImperativeLenet()
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            imperative_out_scale.save_quantized_model(
-                layer=lenet,
-                path=path,
-                input_spec=[
-                    paddle.static.InputSpec(
-                        shape=[None, 1, 28, 28], dtype='float32')
-                ])
-
-        warning_message = "Warning: No Layer of the model while to be saved contains the out_threshold attribute, " \
-                "so the generated inference model would not contain the out_threshold."
-        num = get_vaild_warning_num(warning_message, w)
-        assert num == 1
+        _logger.info("op_cout: {}".format(op_count))
+        self.assertTrue(op_count == 14)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 96b3b67103..99a2352540 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -17,6 +17,8 @@ from __future__ import print_function
 import os
 import numpy as np
 import random
+import shutil
+import time
 import unittest
 import logging
 import paddle
@@ -157,6 +159,20 @@ class TestImperativeQat(unittest.TestCase):
     QAT = quantization-aware training
     """
 
+    @classmethod
+    def setUpClass(cls):
+        timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
+        cls.root_path = os.path.join(os.getcwd(), "imperative_qat_" + timestamp)
+        cls.save_path = os.path.join(cls.root_path, "lenet")
+        cls.dynamic_root_path = os.path.join(os.getcwd(),
+                                             "dynamic_mnist_" + timestamp)
+        cls.dynamic_save_path = os.path.join(cls.dynamic_root_path, "model")
+
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.root_path)
+        shutil.rmtree(cls.dynamic_root_path)
+
     def test_qat_save(self):
         imperative_qat = ImperativeQuantAware(
             weight_quantize_type='abs_max',
@@ -206,6 +222,8 @@ class TestImperativeQat(unittest.TestCase):
                             "Train | At epoch {} step {}: loss = {:}, acc= {:}".
                             format(epoch, batch_id,
                                    avg_loss.numpy(), acc.numpy()))
+                    if batch_id == 500:  # For shortening CI time
+                        break
 
                 lenet.eval()
                 for batch_id, data in enumerate(test_reader()):
@@ -242,11 +260,9 @@ class TestImperativeQat(unittest.TestCase):
             before_save = lenet(test_img)
 
         # save inference quantized model
-        path = "./qat_infer_model/lenet"
-        save_dir = "./qat_infer_model"
         paddle.jit.save(
             layer=lenet,
-            path=path,
+            path=TestImperativeQat.save_path,
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, 1, 28, 28], dtype='float32')
@@ -259,7 +275,7 @@ class TestImperativeQat(unittest.TestCase):
         exe = fluid.Executor(place)
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(
-             dirname=save_dir,
+             dirname=TestImperativeQat.root_path,
              executor=exe,
              model_filename="lenet" + INFER_MODEL_SUFFIX,
              params_filename="lenet" + INFER_PARAMS_SUFFIX)
@@ -351,7 +367,7 @@ class TestImperativeQat(unittest.TestCase):
 
         paddle.jit.save(
             layer=lenet,
-            path="./dynamic_mnist/model",
+            path=TestImperativeQat.dynamic_save_path,
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, 1, 28, 28], dtype='float32')
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py
index d76e4825e0..f5b3e89ef4 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py
@@ -17,6 +17,8 @@ from __future__ import print_function
 import os
 import numpy as np
 import random
+import shutil
+import time
 import unittest
 import logging
 import paddle
@@ -185,6 +187,21 @@ class ImperativeLenet(fluid.dygraph.Layer):
 
 
 class TestImperativeAddQuantDequant(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
+        cls.root_path = os.path.join(os.getcwd(),
+                                     "imperative_qat_aqd_" + timestamp)
+        cls.save_path = os.path.join(cls.root_path, "lenet")
+        cls.dynamic_root_path = os.path.join(os.getcwd(),
+                                             "dynamic_mnist_aqd_" + timestamp)
+        cls.dynamic_save_path = os.path.join(cls.dynamic_root_path, "model")
+
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.root_path)
+        shutil.rmtree(cls.dynamic_root_path)
+
     def test_qat_save(self):
 
         imperative_qat = ImperativeQuantAware(
@@ -228,6 +245,8 @@ class TestImperativeAddQuantDequant(unittest.TestCase):
                             "Train | At epoch {} step {}: loss = {:}, acc= {:}".
                             format(epoch, batch_id,
                                    avg_loss.numpy(), acc.numpy()))
+                    if batch_id == 500:  # For shortening CI time
+                        break
 
                 lenet.eval()
                 for batch_id, data in enumerate(test_reader()):
@@ -264,11 +283,9 @@ class TestImperativeAddQuantDequant(unittest.TestCase):
             before_save = lenet(test_img)
 
         # save inference quantized model
-        path = "./qat_infer_model/lenet"
-        save_dir = "./qat_infer_model"
         paddle.jit.save(
             layer=lenet,
-            path=path,
+            path=TestImperativeAddQuantDequant.save_path,
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, 1, 28, 28], dtype='float32')
@@ -280,7 +297,7 @@ class TestImperativeAddQuantDequant(unittest.TestCase):
         exe = fluid.Executor(place)
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(
-             dirname=save_dir,
+             dirname=TestImperativeAddQuantDequant.root_path,
              executor=exe,
              model_filename="lenet" + INFER_MODEL_SUFFIX,
              params_filename="lenet" + INFER_PARAMS_SUFFIX)
@@ -378,7 +395,7 @@ class TestImperativeAddQuantDequant(unittest.TestCase):
             lenet.eval()
         paddle.jit.save(
             layer=lenet,
-            path="./dynamic_mnist/model",
+            path=TestImperativeAddQuantDequant.dynamic_save_path,
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, 1, 28, 28], dtype='float32')
diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py
index ddc010d042..8e90b308b3 100644
--- a/python/paddle/fluid/dataloader/collate.py
+++ b/python/paddle/fluid/dataloader/collate.py
@@ -27,24 +27,31 @@ except:
 def default_collate_fn(batch):
     """
     Default batch collating function for :code:`paddle.io.DataLoader`,
-    batch should be a list of samples, and each sample should be a list
-    of fields as follows:
+    get input data as a list of sample datas, each element in list
+    if the data of a sample, and sample data should composed of list,
+    dictionary, string, number, numpy array and paddle.Tensor, this
+    function will parse input data recursively and stack number,
+    numpy array and paddle.Tensor datas as batch datas. e.g. for
+    following input data:
+
+    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 3},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 4},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]
     
-    [[filed1, filed2, ...], [filed1, filed2, ...], ...]
     
-    This default collate function zipped each filed together and stack
-    each filed as the batch field as follows:
+    This default collate function zipped each number and numpy array
+    field together and stack each field as the batch field as follows:
+
+    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
 
-    [batch_filed1, batch_filed2, ...]
 
     Args:  
-        batch(list of list of numpy array|paddle.Tensor): the batch data, each fields
-              should be a numpy array, each sample should be a list of
-              fileds, and batch should be a list of sample.
+        batch(list of sample data): batch should be a list of sample data.
     
     Returns:
-        a list of numpy array|Paddle.Tensor: collated batch of input batch data,
-            fields data type as same as fields in each sample.
+        Batched data: batched each number, numpy array and paddle.Tensor
+                      in input data.
     """
     sample = batch[0]
     if isinstance(sample, np.ndarray):
@@ -75,6 +82,24 @@ def default_collate_fn(batch):
 
 
 def default_convert_fn(batch):
+    """
+    Default batch converting function for :code:`paddle.io.DataLoader`.
+    get input data as a list of sample datas, each element in list
+    if the data of a sample, and sample data should composed of list,
+    dictionary, string, number, numpy array and paddle.Tensor.
+
+    .. note::
+        This function is default :attr:`collate_fn` in **Distable
+        automatic batching** mode, for **Distable automatic batching**
+        mode, please ses :attr:`paddle.io.DataLoader`
+
+    Args:  
+        batch(list of sample data): batch should be a list of sample data.
+    
+    Returns:
+        Batched data: batched each number, numpy array and paddle.Tensor
+                      in input data.
+    """
     if isinstance(batch, (paddle.Tensor, np.ndarray)):
         return batch
     elif isinstance(batch, (str, bytes)):
diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py
index b923f36af8..0f98af5772 100644
--- a/python/paddle/fluid/device_worker.py
+++ b/python/paddle/fluid/device_worker.py
@@ -427,7 +427,7 @@ class Section(DeviceWorker):
         section_param.schedule_mode = schedule_mode
         cfg = section_param.section_config
         program = pipeline_opt["section_program"]
-        cfg.program_desc.ParseFromString(program["program"]._get_desc()
+        cfg.program_desc.ParseFromString(program._get_desc()
                                          .serialize_to_string())
         # TODO: why does not work
         # cfg.program_desc.CopyFrom(program.program._get_desc())
diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py
index dd04b10720..e80bc1245f 100644
--- a/python/paddle/fluid/dygraph/container.py
+++ b/python/paddle/fluid/dygraph/container.py
@@ -213,13 +213,25 @@ class LayerList(Layer):
             for idx, layer in enumerate(sublayers):
                 self.add_sublayer(str(idx), layer)
 
+    def _get_abs_idx(self, idx):
+        if isinstance(idx, int):
+            if not (-len(self) <= idx < len(self)):
+                raise IndexError(
+                    'index {} is out of range, should be an integer in range [{}, {})'.
+                    format(idx, -len(self), len(self)))
+            if idx < 0:
+                idx += len(self)
+        return idx
+
     def __getitem__(self, idx):
         if isinstance(idx, slice):
             return self.__class__(list(self._sub_layers.values())[idx])
         else:
+            idx = self._get_abs_idx(idx)
             return self._sub_layers[str(idx)]
 
     def __setitem__(self, idx, sublayer):
+        idx = self._get_abs_idx(idx)
         return setattr(self, str(idx), sublayer)
 
     def __delitem__(self, idx):
@@ -227,6 +239,7 @@ class LayerList(Layer):
             for k in range(len(self._sub_layers))[idx]:
                 delattr(self, str(k))
         else:
+            idx = self._get_abs_idx(idx)
             delattr(self, str(idx))
         str_indices = [str(i) for i in range(len(self._sub_layers))]
         self._sub_layers = OrderedDict(
@@ -275,10 +288,15 @@ class LayerList(Layer):
                 another = paddle.nn.Linear(10, 10)
                 linears.insert(3, another)
                 print(linears[3] is another)  # True
+                another = paddle.nn.Linear(10, 10)
+                linears.insert(-1, another)
+                print(linears[-2] is another) # True
         """
         assert isinstance(index, int) and \
-               0 <= index < len(self._sub_layers), \
-            "index should be an integer in range [0, len(self))"
+               -len(self._sub_layers) <= index < len(self._sub_layers), \
+            "index should be an integer in range [{}, {})".format(-len(self), len(self))
+
+        index = self._get_abs_idx(index)
         for i in range(len(self._sub_layers), index, -1):
             self._sub_layers[str(i)] = self._sub_layers[str(i - 1)]
         self._sub_layers[str(index)] = sublayer
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
index b7ef000938..bd89a79c80 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
@@ -594,7 +594,7 @@ class LoopTransformer(gast.NodeTransformer):
         # append return values for loop body
         body_stmts.append(
             gast.Return(value=generate_name_node(
-                loop_var_names, ctx=gast.Load())))
+                loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True)))
         body_func_node = gast.FunctionDef(
             name=unique_name.generate(FOR_BODY_PREFIX),
             args=gast.arguments(
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index 1071fc1350..624ca085ac 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -381,9 +381,15 @@ def get_attribute_full_name(node):
     return astor.to_source(gast.gast_to_ast(node)).strip()
 
 
-def generate_name_node(name_ids, ctx=gast.Load()):
+def generate_name_node(name_ids, ctx=gast.Load(), gen_tuple_if_single=False):
     """
-    Generate list or gast.Tuple of ast.Name for Return statement.
+    If name_ids is list or tuple or set with multiple strings, this function
+    generates gast.Tuple of gast.Name.
+    If the name_ids is single string or contains only 1 string, this function
+    returns gast.Name if gen_tuple_if_single==False else returns gast.Tuple
+    with only one gast.Name
+
+    This function is used at several gast.Return statements.
     """
     if isinstance(name_ids, six.string_types):
         name_ids = [name_ids]
@@ -395,7 +401,7 @@ def generate_name_node(name_ids, ctx=gast.Load()):
             id=name_id, ctx=ctx, annotation=None, type_comment=None)
         for name_id in name_ids
     ]
-    if len(gast_names) == 1:
+    if len(gast_names) == 1 and not gen_tuple_if_single:
         name_node = gast_names[0]
     else:
         name_node = gast.Tuple(elts=gast_names, ctx=ctx)
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 9b0b04a6ea..da326ec074 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -1458,7 +1458,7 @@ class Executor(object):
         dataset._prepare_to_run()
         real_fetch_list = []
         if program._pipeline_opt:
-            real_program = program._pipeline_opt["section_program"]['program']
+            real_program = program._pipeline_opt["section_program"]
             for fetch_var in fetch_list:
                 if isinstance(fetch_var, Variable):
                     fetch_var_name = fetch_var.name
@@ -1467,13 +1467,20 @@ class Executor(object):
                 if fetch_var_name in real_program.global_block().vars:
                     real_fetch_list.append(fetch_var)
 
-            program._pipeline_opt["section_program"][
-                'program'] = self._add_feed_fetch_ops(
-                    program=program._pipeline_opt["section_program"]['program'],
-                    feed=[],
-                    fetch_list=real_fetch_list,
-                    feed_var_name='feed',
-                    fetch_var_name='fetch')
+            program._pipeline_opt["section_program"] = self._add_feed_fetch_ops(
+                program=program._pipeline_opt["section_program"],
+                feed=[],
+                fetch_list=real_fetch_list,
+                feed_var_name='feed',
+                fetch_var_name='fetch')
+            main_block = program._pipeline_opt["section_program"].block(0)
+            for op in main_block.ops:
+                # set the op_role of fetch op to Optimize to avoid
+                # erase the fetched vars by gc for pipeline
+                if op.type == 'fetch':
+                    op._set_attr(
+                        'op_role',
+                        core.op_proto_and_checker_maker.OpRole.Optimize)
             fetch_list = None
 
         scope, trainer = self._prepare_trainer(
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index db487128bb..be795b9e59 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -53,7 +53,6 @@ __all__ = [
     'is_compiled_with_cuda',
     'is_compiled_with_xpu',
     'Variable',
-    'load_op_library',
     'require_version',
     'device_guard',
     'set_flags',
@@ -1863,6 +1862,7 @@ class Variable(object):
         if not isinstance(item, tuple):
             item = [item]
 
+        decrease_axes = []
         axes = []
         starts = []
         ends = []
@@ -1933,15 +1933,23 @@ class Variable(object):
                 if end is None:
                     end = max_integer if step > 0 else (0 - max_integer)
             else:
+                decrease_axes.append(dim)
                 start = slice_item
                 end = slice_item + 1 if slice_item != -1 else max_integer
                 step = 1
+
             axes.append(dim)
             starts.append(start)
             ends.append(end)
             steps.append(step)
 
-        attrs = {'axes': axes, 'starts': starts, 'ends': ends, 'steps': steps}
+        attrs = {
+            'axes': axes,
+            'starts': starts,
+            'ends': ends,
+            'steps': steps,
+            'decrease_axes': decrease_axes
+        }
 
         from .layers import utils
         if utils._contain_var(starts):
@@ -1997,7 +2005,8 @@ class Variable(object):
                 "paddle.Tensor to a paddle.Tensor, but received {}".format(
                     type(value)))
 
-        self.block.append_op(
+        cur_block = default_main_program().current_block()
+        cur_block.append_op(
             type="set_value", inputs=inputs, outputs={'Out': self}, attrs=attrs)
 
         return self
@@ -5761,33 +5770,6 @@ def _dygraph_place_guard(place):
         _set_dygraph_tracer_expected_place(tmp_place)
 
 
-def load_op_library(lib_filename):
-    """
-    :api_attr: Static Graph
-
-    Load a dynamic library, including custom operators and kernels.
-    When library is loaded, ops and kernels registered in the library
-    will be available in PaddlePaddle main process.
-    Please note, the type of custom operators can't have the same type
-    with the existing operators in the framework.
-
-    Args:
-        lib_filename (str): name of dynamic library.
-    
-    Returns:
-        list[str]: new registered custom op names.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            #fluid.load_op_library('custom_op.so')
-
-    """
-    core.load_op_library(lib_filename)
-    return OpProtoHolder.instance().update_op_proto()
-
-
 def switch_device(device):
     global _current_device
     pre_device = _current_device
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 00d1db19fc..6bc69ffd5c 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1603,6 +1603,10 @@ def conv2d(input,
 
     pre_bias = helper.create_variable_for_type_inference(dtype)
 
+    if (core.is_compiled_with_cuda() and paddle.fluid.get_flags(
+            "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]):
+        use_cudnn = False
+
     helper.append_op(
         type=l_type,
         inputs={
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 9c724cbfdd..37b2554b8a 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -3784,6 +3784,12 @@ class PipelineOptimizer(object):
                              "Optimizer, but the given type is {}.".format(
                                  type(optimizer)))
         self._optimizer = optimizer
+
+        # Get the original optimizer defined by users, such as SGD
+        self._origin_optimizer = self._optimizer
+        while hasattr(self._origin_optimizer, "inner_opt"):
+            self._origin_optimizer = self._origin_optimizer.inner_opt
+
         assert num_microbatches >= 1, (
             "num_microbatches must be a positive value.")
         self._num_microbatches = num_microbatches
@@ -3797,13 +3803,98 @@ class PipelineOptimizer(object):
         self._op_role_var_key = op_maker.kOpRoleVarAttrName()
         self._op_device_key = op_maker.kOpDeviceAttrName()
         self._param_device_map = None
+        self._pipeline_pair = []
+        self._pp_ring_map = dict()
+        self._global_ring_id = None
+
+    # insert allreduce op to sync global information for global
+    # gradient clip and amp
+    def _insert_allreduce_op(self, op_idx, block):
+        """
+        Insert allreduce op to sync global information for global
+        gradient clip and amp.
+        """
+        op = block.ops[op_idx]
+        out_name = op.desc.output_arg_names()[0]
+        out_var = block.var(out_name)
+        offset = 0
+        if op.type == "reduce_any":
+            # cast the bool var to int32 to use allreduce_max op
+            temp_var_name = unique_name.generate(out_name + "_cast_int32")
+            temp_var = block.create_var(
+                name=temp_var_name, shape=[1], dtype="int32")
+            block._insert_op(
+                op_idx + 1 + offset,
+                type='cast',
+                inputs={'X': out_var},
+                outputs={'Out': temp_var},
+                attrs={
+                    'in_dtype': out_var.dtype,
+                    'out_dtype': temp_var.dtype,
+                    self._op_role_key: self._op_role.Optimize
+                })
+            offset += 1
+        block._insert_op(
+            op_idx + 1 + offset,
+            type='c_allreduce_max'
+            if op.type == "reduce_any" else 'c_allreduce_sum',
+            inputs={'X': temp_var if op.type == "reduce_any" else out_var},
+            outputs={'Out': temp_var if op.type == "reduce_any" else out_var},
+            attrs={
+                'ring_id': self._global_ring_id,
+                self._op_role_key: self._op_role.Optimize,
+                'use_calc_stream': True
+            })
+        offset += 1
+        if op.type == "reduce_any":
+            block._insert_op(
+                op_idx + 1 + offset,
+                type='cast',
+                inputs={'X': temp_var},
+                outputs={'Out': out_var},
+                attrs={
+                    'in_dtype': temp_var.dtype,
+                    'out_dtype': out_var.dtype,
+                    self._op_role_key: self._op_role.Optimize
+                })
+        return offset
 
     def _create_vars(self, block, ori_block):
-        # Create vars for block, copied from main_program's global block
+        # Create vars for block, copied from ori_block
         used_var_set = set()
-        for op_idx in range(block.desc.op_size()):
-            op_desc = block.desc.op(op_idx)
-            vars = op_desc.input_arg_names() + op_desc.output_arg_names()
+        added_op_num = 0
+        op_idx = 0
+        op_size = block.desc.op_size()
+        while op_idx < op_size + added_op_num:
+            # Whether to insert allreduce_sum or allreduce_max op.
+            # For amp and global gradient clip strategies, we should
+            # get the global information, so allreduce op is needed.
+            should_insert = False
+            op = block.ops[op_idx]
+            # For op process vars on all devices, remove its input 
+            # vars not in this block
+            reserved_x = []
+            if op.type == 'reduce_any' and self._is_optimize_op(op):
+                should_insert = True
+            elif op.type == 'concat' and self._is_optimize_op(op):
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+            elif op.type == 'update_loss_scaling':
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+                op.desc.set_output('Out', reserved_x)
+            elif op.type == 'sum' and self._is_gradient_clip_op(op):
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+                should_insert = True
+
+            vars = op.desc.input_arg_names() + op.desc.output_arg_names()
             for var in vars:
                 # a var whose name contains "blocking_queue" 
                 # only exists in startup program 
@@ -3813,27 +3904,39 @@ class PipelineOptimizer(object):
                 if block._find_var_recursive(str(var)): continue
                 source_var = ori_block._var_recursive(str(var))
                 if source_var.type == core.VarDesc.VarType.READER:
-                    block.create_var(
+                    dest_var = block.create_var(
                         name=var,
                         type=core.VarDesc.VarType.READER,
                         persistable=source_var.persistable)
                 else:
-                    block._clone_variable(source_var, False)
+                    dest_var = block._clone_variable(source_var, False)
+                dest_var.stop_gradient = source_var.stop_gradient
+            # When use with sharding, allreduce_sum and allreduce_max
+            # used for global gradient clip and amp will be added by sharding.
+            op_idx += 1
+            if self.use_sharding or not should_insert: continue
+            inserted_ops = self._insert_allreduce_op(op_idx - 1, block)
+            added_op_num += inserted_ops
+            op_idx += inserted_ops
+        block._sync_with_cpp()
 
     def _is_loss_grad_op(self, op):
-        if self._op_role_key not in op.attr_names:
-            return False
-        op_role = int(op.all_attrs()[self._op_role_key])
+        assert self._op_role_key in op.attr_names
+        op_role = int(op.attr(self._op_role_key))
         return op_role & int(self._op_role.Backward) and op_role & int(
             self._op_role.Loss)
 
     def _is_backward_op(self, op):
-        return self._op_role_key in op.attr_names and int(op.all_attrs()[
-            self._op_role_key]) & int(self._op_role.Backward)
+        return self._op_role_key in op.attr_names and (
+            int(op.attr(self._op_role_key)) & int(self._op_role.Backward))
+
+    def _is_loss_op(self, op):
+        assert self._op_role_key in op.attr_names
+        return int(op.attr(self._op_role_key)) == int(self._op_role.Loss)
 
     def _is_optimize_op(self, op):
-        return self._op_role_key in op.attr_names and int(op.all_attrs()[
-            self._op_role_key]) & int(self._op_role.Optimize)
+        return self._op_role_key in op.attr_names and (
+            int(op.attr(self._op_role_key)) & int(self._op_role.Optimize))
 
     def _is_update_op(self, op):
         return 'Param' in op.input_names and 'Grad' in op.input_names and (
@@ -3842,50 +3945,40 @@ class PipelineOptimizer(object):
     def _split_program(self, main_program, devices):
         """
         Split a program into sections according to devices that ops run on.
-        The ops of the role LRSched are copied to all sections.
+        The op whose op_device attr is "gpu:all" is copied to all sections.
 
         Args:
             main_program (Program): the main program
             devices: all used devices
         """
-        programs = []
         # Map from device to its corresponding section program info
-        device_program_map = dict()
-        for device in devices:
-            p = {'program': Program()}
-            device_program_map[device] = p
+        device_program_map = defaultdict(Program)
 
         block = main_program.block(0)
         for op in block.ops:
             device = op.attr(self._op_device_key)
-            op_role = op.attr(self._op_role_key)
-            if int(op_role) & int(self._op_role.LRSched):
-                # Copy ops of the role LRSched to all sections.
-                for device in device_program_map.keys():
+            # Copy ops whose op_device set to "gpu:all" to all sections.
+            if device == "gpu:all":
+                for device in devices:
                     program = device_program_map[device]
                     op_desc = op.desc
-                    ap_op = program["program"].block(0).desc.append_op()
-                    ap_op.copy_from(op_desc)
-                    # ap_op._set_attr(self._op_device_key, "")
-            elif op.type == "create_py_reader" or op.type == "read" or op.type == "create_double_buffer_reader":
-                # Copy read related ops to all section to make them exit after each epoch.
-                for device in device_program_map.keys():
-                    program = device_program_map[device]
-                    op_desc = op.desc
-                    ap_op = program["program"].block(0).desc.append_op()
+                    ap_op = program.global_block().desc.append_op()
                     ap_op.copy_from(op_desc)
+                    ap_op._set_attr(self._op_device_key, "")
             else:
                 program = device_program_map[device]
                 op_desc = op.desc
-                ap_op = program["program"].block(0).desc.append_op()
+                ap_op = program.global_block().desc.append_op()
                 ap_op.copy_from(op_desc)
+                ap_op._set_attr(self._op_device_key, "")
 
+        program_list = []
         for key in devices:
             program = device_program_map[key]
-            program['program']._sync_with_cpp()
-            programs.append(program)
+            program._sync_with_cpp()
+            program_list.append(program)
 
-        return programs
+        return program_list
 
     def _get_op_device_for_startup_program(self, var_name):
         """
@@ -3894,21 +3987,22 @@ class PipelineOptimizer(object):
         get the real op_device attribute of the fill_constant as the device
         where the corresponding parameters on.
         """
-        assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name
+        assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name, \
+            'For accumulators for Adam, the name must contain beta1_pow_acc ' \
+            'or beta2_pow_acc.'
         param_name = var_name[0:var_name.index('_beta')]
         device = self._param_device_map[param_name]
         return device
 
-    def _split_startup_program(self, startup_program, local_rank):
-        block = startup_program.block(0)
+    def _split_startup_program(self, startup_program, device_id):
+        block = startup_program.global_block()
         new_startup_program = Program()
         for op in block.ops:
             device = op.attr(self._op_device_key)
             if device == "cpu":
                 assert op.type == "fill_constant", (
-                    "For ops in startup "
-                    "program that with the op_device attribute of cpu, "
-                    "they must be fill_constant.")
+                    "For ops in startup program with the op_device attribute "
+                    "of cpu, they must be of type fill_constant.")
                 output_var = op.output_arg_names[0]
                 device = self._get_op_device_for_startup_program(output_var)
 
@@ -3917,14 +4011,13 @@ class PipelineOptimizer(object):
             else:
                 # LR related ops
                 device = None
-            if device and device_index != local_rank: continue
+            if device and device_index != device_id: continue
             op_desc = op.desc
-            ap_op = new_startup_program.block(0).desc.append_op()
+            ap_op = new_startup_program.global_block().desc.append_op()
             ap_op.copy_from(op_desc)
             ap_op._set_attr(self._op_device_key, "")
         new_startup_program._sync_with_cpp()
-        self._create_vars(
-            new_startup_program.block(0), startup_program.global_block())
+        self._create_vars(new_startup_program.global_block(), block)
         return new_startup_program
 
     def _find_post_op(self, ops, cur_op, var_name):
@@ -3937,6 +4030,11 @@ class PipelineOptimizer(object):
                                var_name as output.
             var_name (string): Variable name.
         """
+        # To skip the cast op added by amp which has no op_device set
+        if '.cast_fp32' in var_name:
+            var_name = var_name.replace('.cast_fp32', '')
+        elif '.cast_fp16' in var_name:
+            var_name = var_name.replace('.cast_fp16', '')
         post_op = []
         before = True
         for op in ops:
@@ -3965,7 +4063,8 @@ class PipelineOptimizer(object):
         """
         prev_op = []
         for op in ops:
-            if op.type == 'send_v2' or op.type == 'recv_v2':
+            if op.type == 'send_v2' or op.type == 'recv_v2' \
+                or op.type == 'c_broadcast':
                 continue
             if op == cur_op:
                 break
@@ -3980,11 +4079,8 @@ class PipelineOptimizer(object):
         return None
 
     def _rename_arg(self, op, old_name, new_name):
-        op_desc = op.desc
-        if isinstance(op_desc, tuple):
-            op_desc = op_desc[0]
-        op_desc._rename_input(old_name, new_name)
-        op_desc._rename_output(old_name, new_name)
+        op._rename_input(old_name, new_name)
+        op._rename_output(old_name, new_name)
 
     def _create_var(self, block, ref_var, name):
         """
@@ -3998,99 +4094,12 @@ class PipelineOptimizer(object):
             dtype=ref_var.dtype,
             type=ref_var.type,
             lod_level=ref_var.lod_level,
-            persistable=False,
-            is_data=False,
+            persistable=ref_var.persistable,
+            is_data=ref_var.is_data,
             need_check_feed=ref_var.desc.need_check_feed())
+        new_var.stop_gradient = ref_var.stop_gradient
         return new_var
 
-    def _get_data_var_info(self, block):
-        """
-        Get info of all vars whose is_data attribute are true.
-        """
-        # map of data vars to devices that that data on
-        data_devices_map = dict()
-        for op in block.ops:
-            dev_spec = op.attr(self._op_device_key)
-            for var_name in op.input_arg_names:
-                if "blocking_queue" in var_name: continue
-                var = block.var(var_name)
-                if not var.is_data:
-                    continue
-                if not var_name in data_devices_map:
-                    data_devices_map[var_name] = []
-                if not dev_spec in data_devices_map[var_name]:
-                    data_devices_map[var_name].append(dev_spec)
-        return data_devices_map
-
-    def _insert_sendrecv_for_data_var(self, main_block, programs, startup,
-                                      devices):
-        """
-        Insert send and recv ops for data var that on other devices.
-
-        Args:
-            main_block (Block): Global block for main program
-            programs (dict): Dictionary for section params
-            startup (Program): Startup program
-            devices (list): List of devices in the format (dev:dev_index)
-        """
-        main_program = main_block.program
-        data_devices_map = self._get_data_var_info(main_block)
-
-        first_prog = programs[0]['program']
-        first_block = first_prog.block(0)
-        insert_index = 0
-        for op in first_block.ops:
-            insert_index += 1
-            if op.type == "read":
-                break
-        first_dev_spec = devices[0]
-        first_dev_index = int(first_dev_spec.split(':')[1])
-        for var_name in data_devices_map.keys():
-            for device in data_devices_map[var_name]:
-                if device == first_dev_spec: continue
-                main_var = main_block.var(var_name)
-                assert main_var.is_data
-                if not var_name in first_block.vars:
-                    self._create_var(first_block, main_var, var_name)
-                dev_index = int(device.split(':')[1])
-                first_block._insert_op(
-                    index=insert_index,
-                    type='send_v2',
-                    inputs={'X': first_block.var(var_name)},
-                    attrs={
-                        self._op_device_key: first_dev_spec,
-                        self._op_role_key: self._op_role.Forward,
-                        'use_calc_stream': True,
-                        'peer': dev_index,
-                    })
-                # Get the device that that data on
-                assert device in devices
-                prog_index = devices.index(device)
-                prog = programs[prog_index]['program']
-                block = prog.block(0)
-                index = 0
-                for op in block.ops:
-                    index += 1
-                    if op.type == "read":
-                        break
-                source_var = main_program.block(0).var(var_name)
-                new_var = self._create_var(block, source_var, var_name)
-                new_var_shape = list(new_var.shape)
-                new_var_shape[0] = self.micro_batch_size if new_var_shape[
-                    0] < 0 else new_var_shape[0]
-                block._insert_op(
-                    index=index,
-                    type='recv_v2',
-                    outputs={'Out': [new_var]},
-                    attrs={
-                        'out_shape': new_var_shape,
-                        'dtype': new_var.dtype,
-                        self._op_device_key: device,
-                        self._op_role_key: self._op_role.Forward,
-                        'peer': first_dev_index,
-                        'use_calc_stream': True,
-                    })
-
     def _strip_grad_suffix(self, name):
         """
         Strip the grad suffix from the given variable name
@@ -4104,95 +4113,161 @@ class PipelineOptimizer(object):
         """
         return name + core.grad_var_suffix()
 
-    def _add_opdevice_attr_for_regularization_clip(self, block):
+    def _get_op_device_attr(self, op):
         """
-        Add op_device attribute for regulization and clip ops.
+        Get the op_device attribute of a op.
         """
-        for op in block.ops:
-            # role for regularization and clip ops is optimize
-            if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize):
-                continue
-            if op.has_attr(self._op_device_key) and (
-                    op.attr(self._op_device_key) != ""):
-                continue
-            assert self._op_role_var_key in op.attr_names
-            op_role_var = op.all_attrs()[self._op_role_var_key]
-            assert len(op_role_var) == 2
+        device = op.attr(self._op_device_key) \
+            if op.has_attr(self._op_device_key) else None
+        if device:
+            assert device[0:3] == 'gpu', "Now, only gpu devices are " \
+                "supported in pipeline parallemism."
+        return device
+
+    def _add_op_device_attr_for_op(self, op, idx, block):
+        """
+        Add op_device attrribute for ops that have not that attribute set.
+        We use "gpu:all" to represent the op should be put on all
+        sub-programs, such as lr-related ops. Note that: "gpu:all"
+        is only used by pipeline as an indicator.
+        """
+        lrsched_role = int(self._op_role.LRSched)
+        if op.attr(self._op_role_key) == lrsched_role:
+            # For LRSched ops, we should put them on all sub-programs to
+            # make sure each sub-program update the lr correctly
+            op._set_attr(self._op_device_key, "gpu:all")
+        elif (op.type == "cast" or
+              op.type == "scale") and self._is_backward_op(op):
+            prev_op = self._find_real_prev_op(block.ops, op,
+                                              op.desc.input("X")[0])
+            op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key))
+        elif op.type == "memcpy" and not self._is_optimize_op(op):
+            assert len(op.input_arg_names) == 1 and len(
+                op.output_arg_names) == 1
+            input_name = op.input_arg_names[0]
+            output_name = op.output_arg_names[0]
+            if '@Fetch' in output_name:
+                post_op = self._find_post_op(block.ops, op, output_name)
+                op._set_attr(self._op_device_key,
+                             post_op.attr(self._op_device_key))
+            else:
+                prev_op = self._find_real_prev_op(block.ops, op,
+                                                  op.desc.input("X")[0])
+                op._set_attr(self._op_device_key,
+                             prev_op.attr(self._op_device_key))
+        elif self._is_loss_op(op):
+            # For loss * loss_scaling op added by AMP
+            offset = 1
+            while (not block.ops[idx + offset].has_attr(self._op_device_key) or
+                   not block.ops[idx + offset].attr(self._op_device_key)):
+                offset += 1
+            device = block.ops[idx + offset].attr(self._op_device_key)
+            assert device, "Please put you program within device_guard scope."
+            for i in range(offset):
+                block.ops[idx + i]._set_attr(self._op_device_key, device)
+        elif self._is_optimize_op(op) and op.type == "check_finite_and_unscale":
+            op_role_var = op.attr(self._op_role_var_key)
             param_name = op_role_var[0]
             device = self._param_device_map[param_name]
             op._set_attr(self._op_device_key, device)
-
-    def _add_default_opdevice_attr(self, block):
+        elif self._is_optimize_op(op) and op.type == "cast":
+            # For fp16-->fp32 cast added by AMP
+            grad_name = op.output('Out')
+            assert len(grad_name) == 1
+            param_name = grad_name[0].strip(core.grad_var_suffix())
+            device = self._param_device_map[param_name]
+            op._set_attr(self._op_device_key, device)
+        elif self._is_gradient_clip_op(op) or self._is_regularization_op(op):
+            # For gradient clip and regularization ops, we set their op_device
+            # attribute to the device where their corresponding parameters on.
+            assert self._op_role_var_key in op.attr_names, "gradient_clip " \
+                "and regularization ops must have op_role_var attribute."
+            op_role_var = op.attr(self._op_role_var_key)
+            assert len(op_role_var) == 2, "op_role_var for gradient_clip " \
+                "regularization ops must have two elements."
+            param_name = op_role_var[0]
+            device = self._param_device_map[param_name]
+            # For sum op added by global gradient clip, it must be 
+            # put on all devices
+            if (op.type == 'sum' or op.type == 'sqrt' or
+                    op.type == 'fill_constant' or
+                    op.type == 'elementwise_max' or
+                    op.type == 'elementwise_div'):
+                device = "gpu:all"
+            op._set_attr(self._op_device_key, device)
+        else:
+            other_known_ops = [
+                'update_loss_scaling', 'reduce_any', 'concat', 'sum'
+            ]
+            assert op.type in other_known_ops, "For other ops without " \
+                "op_device set, they must be one of {}, but it " \
+                "is {}".format(other_known_ops, op.type)
+            assert self._is_optimize_op(op)
+            op._set_attr(self._op_device_key, "gpu:all")
+
+    def _add_op_device_attr(self, block):
         """
-        1. Add default op_device attribute for lr-related ops.
-           The default value is the one that of the first place.
-        2. Add default op_device attribute for sum ops added during
-           backward. For these ops, we set the op_device attribute
-           as the one of its post op, i.e, which op has the output of the
-           sum op as an input.
+        Add op_device attrribute for ops in block that have 
+        not that attribute set.
         """
-        first_devcie = ""
-
-        # Get the device spec of the first place.
-        # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device,
-        # e.g. 'gpu:0', 'gpu:1', etc.
-        for op in block.ops:
-            if op.has_attr(self._op_device_key) and (
-                    op.attr(self._op_device_key) != ""):
-                first_device = op.attr(self._op_device_key)
-                break
-        assert first_device
-        first_device_type = first_device.split(":")[0]
-        assert first_device_type == "gpu"
-
-        # set op_device attr for lr-related ops
-        lrsched_role = int(self._op_role.LRSched)
-        for op in block.ops:
-            if not op.has_attr(self._op_device_key) or (
-                    op.attr(self._op_device_key) == ""):
-                if op.type == "sum":
-                    # For sum ops that compute the sum of @RENAMED@ vars
-                    for name in op.desc.input_arg_names():
-                        assert '@RENAME@' in name
-                    assert len(op.desc.output_arg_names()) == 1
-                    out_name = op.desc.output_arg_names()[0]
-                    post_op = self._find_post_op(block.ops, op, out_name)
-                    device = post_op.attr(self._op_device_key)
-                    assert device
-                    op._set_attr(self._op_device_key, device)
-                    continue
-
-                assert op.attr(self._op_role_key) == lrsched_role, (
-                    "Op whose op_device attr has not been set for pipeline"
-                    " must be of the role LRSched.")
-                op._set_attr(self._op_device_key, first_device)
+        for idx, op in enumerate(list(block.ops)):
+            if (op.type == "create_py_reader" or op.type == "read" or
+                    op.type == "create_double_buffer_reader"):
+                # Copy read related ops to all section to make them exit 
+                # after each epoch.
+                # We use "gpu:all" to represent the op should be put on all
+                # sub-programs, such as lr-related ops. Note that: "gpu:all"
+                # is only used by pipeline as an indicator.
+                op._set_attr(self._op_device_key, "gpu:all")
+                continue
+            # op_device attribute has been set
+            if self._get_op_device_attr(op): continue
+            self._add_op_device_attr_for_op(op, idx, block)
 
     def _check_validation(self, block):
         """
-        Check whether ops in a block are all validate (i.e., the 
-        op_device attribute has been set).
-        Then, return all device specifications in order.
+        Check whether ops in a block have both the op_device and the 
+        op_role attributes set.
+        Then, return all devices in order.
         """
-        device_specs = []
+        device_list = []
+        # Section worker only supports the following op_role
+        valid_op_role_value = [
+            int(self._op_role.LRSched),
+            int(self._op_role.Forward),
+            int(self._op_role.Backward),
+            int(self._op_role.Loss),
+            int(self._op_role.Optimize),
+            int(self._op_role.Backward) | int(self._op_role.Loss),
+        ]
         for op in block.ops:
-            type = op.type
-            if not op._has_kernel(type):
+            if not op._has_kernel(op.type):
                 assert op.type == "conditional_block" and (
                     op.attr(self._op_role_key) == int(self._op_role.LRSched)), (
                         "Now, the only supported op without kernel is "
                         "conditional_block, and its op role must be LRSched.")
+            assert op.has_attr(self._op_role_key), (
+                "op ({}) has no {} attribute.".format(op.type,
+                                                      self._op_role_key))
+            assert int(op.attr(self._op_role_key)) in valid_op_role_value, \
+                "op_role {} for op {} must be one of {}".format(
+                    op.attr(self._op_role_key),
+                    op.type,
+                    valid_op_role_value)
             assert op.has_attr(self._op_device_key), (
                 "op ({}) has no {} attribute.".format(op.type,
                                                       self._op_device_key))
-            dev_spec = op.attr(self._op_device_key)
-            assert dev_spec, ("op_device attribute for op "
-                              "{} has not been set.".format(op.type))
-            dev_type = dev_spec.split(':')[0]
+
+            device = op.attr(self._op_device_key)
+            assert device, ("op_device attribute for op "
+                            "{} has not been set.".format(op.type))
+            if device == "gpu:all": continue
+            dev_type = device.split(':')[0]
             assert dev_type == "gpu", ("Now only gpu devices are supported "
                                        "for pipeline parallelism.")
-            if not dev_spec in device_specs:
-                device_specs.append(dev_spec)
-        return device_specs
+            if not device in device_list:
+                device_list.append(device)
+        return device_list
 
     def _insert_sendrecv_ops_for_boundaries(self, block):
         """
@@ -4201,38 +4276,36 @@ class PipelineOptimizer(object):
         """
         extra_index = 0
 
-        # A map from var to device spec where op takes it as input,
+        # A map from var to device where op takes it as input,
         # avoiding multiple send and recv ops.
-        var_devspec = dict()
+        var_dev_map = dict()
 
         for index, op in enumerate(list(block.ops)):
-            # skips lr-related ops and vars, as we will process them later.
-            if int(op.attr(self._op_role_key)) & int(self._op_role.LRSched):
-                continue
-            # skips update ops and vars, as we will process them later.
-            if self._is_update_op(op): continue
-
-            cur_device_spec = op.attr(self._op_device_key)
+            cur_device = op.attr(self._op_device_key)
+            if cur_device == "gpu:all": continue
             for var_name in op.input_arg_names:
                 # i.e., lod_tensor_blocking_queue created by DataLoader,
                 # which only exists in startup program.
-                if not var_name in block.vars: continue
                 var = block.var(var_name)
                 # skip data, because we will process it later
                 if var.is_data: continue
+                prev_device = None
+                if var_name in self._param_device_map:
+                    prev_device = self._param_device_map[var_name]
                 prev_op = self._find_real_prev_op(block.ops, op, var_name)
-                if prev_op is None:
-                    continue
-                prev_device_spec = prev_op.attr(self._op_device_key)
+                if not prev_device:
+                    prev_device = prev_op.attr(self._op_device_key) \
+                        if prev_op else None
+                if not prev_device or prev_device == 'gpu:all': continue
 
-                if prev_device_spec != cur_device_spec:
-                    if var_name not in var_devspec:
-                        var_devspec[var_name] = []
-                    if cur_device_spec in var_devspec[var_name]: continue
-                    var_devspec[var_name].append(cur_device_spec)
+                if prev_device != cur_device:
+                    if var_name not in var_dev_map: var_dev_map[var_name] = []
+                    if cur_device in var_dev_map[var_name]: continue
+                    var_dev_map[var_name].append(cur_device)
 
                     op_role = op.all_attrs()[self._op_role_key]
                     var = block.vars[var_name]
+<<<<<<< HEAD
                     prev_device_index = int(prev_device_spec.split(':')[1])
                     cur_device_index = int(cur_device_spec.split(':')[1])
                     block._insert_op(
@@ -4287,62 +4360,240 @@ class PipelineOptimizer(object):
                     # a trick to run this op once per mini-batch
                     self._op_role_key: self._op_role.Optimize.LRSched,
                 })
+=======
+                    prev_device_index = int(prev_device.split(':')[1])
+                    cur_device_index = int(cur_device.split(':')[1])
+                    pair = (prev_device_index, cur_device_index)
+                    pair_key = prev_device_index * 1000 + cur_device_index
+                    if pair not in self._pipeline_pair:
+                        self._pipeline_pair.append(pair)
+                        self._pp_ring_map[pair_key] = self.ring_id
+                        ring_id = self.ring_id
+                        self.ring_id += 1
+                    else:
+                        ring_id = self._pp_ring_map[pair_key]
+                    if self.schedule_mode == 'F-then-B':  # F-then-B
+                        block._insert_op(
+                            index=index + extra_index,
+                            type='send_v2',
+                            inputs={'X': var},
+                            attrs={
+                                self._op_device_key: prev_device,
+                                self._op_role_key: op_role,
+                                'use_calc_stream': True,
+                                'peer': 1,
+                                'ring_id': ring_id
+                            })
+                        extra_index += 1
+                        block._insert_op(
+                            index=index + extra_index,
+                            type='recv_v2',
+                            outputs={'Out': [var]},
+                            attrs={
+                                'out_shape': var.shape,
+                                'dtype': var.dtype,
+                                self._op_device_key: cur_device,
+                                self._op_role_key: op_role,
+                                'use_calc_stream': True,
+                                'peer': 0,
+                                'ring_id': ring_id
+                            })
+                        extra_index += 1
+                    elif self.schedule_mode == '1F1B':  # 1F1B
+                        block._insert_op(
+                            index=index + extra_index,
+                            type='c_sync_calc_stream',
+                            inputs={'X': [var]},
+                            outputs={'Out': [var]},
+                            attrs={
+                                self._op_device_key: prev_device,
+                                self._op_role_key: op_role,
+                            })
+                        extra_index += 1
+                        block._insert_op(
+                            index=index + extra_index,
+                            type='send_v2',
+                            inputs={'X': var},
+                            attrs={
+                                self._op_device_key: prev_device,
+                                self._op_role_key: op_role,
+                                'use_calc_stream': False,
+                                'ring_id': ring_id,
+                                'peer': 1,
+                            })
+                        extra_index += 1
+                        block._insert_op(
+                            index=index + extra_index,
+                            type='c_sync_comm_stream',
+                            inputs={'X': [var]},
+                            outputs={'Out': [var]},
+                            attrs={
+                                self._op_device_key: prev_device,
+                                self._op_role_key: self._op_role.Backward,
+                                'ring_id': ring_id,
+                            })
+                        extra_index += 1
+                        var_shape = list(var.shape)
+                        var_shape[0] = self.micro_batch_size if var_shape[
+                            0] < 0 else var_shape[0]
+                        block._insert_op(
+                            index=index + extra_index,
+                            type='recv_v2',
+                            outputs={'Out': [var]},
+                            attrs={
+                                'out_shape': var_shape,
+                                'dtype': var.dtype,
+                                self._op_device_key: cur_device,
+                                self._op_role_key: op_role,
+                                'use_calc_stream': True,
+                                'peer': 0,
+                                'ring_id': ring_id
+                            })
+                        extra_index += 1
+                    else:
+                        raise ValueError(
+                            "Now only 'F-then-B' and '1F1B' are supported."
+                            "The given value is {}.".format(self.schedule_mode))
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
-    def _accumulate_gradients(self, block):
+    def _insert_loss_scale(self, block):
         """
-        Accumulate the gradients generated in microbatch to the one in mini-batch.
-        We also scale the loss corresponding to number of micro-batches as well.
+        Scale the loss corresponding to number of micro-batches.
         """
+        if self._num_microbatches == 1: return
         for index, op in reversed(tuple(enumerate(list(block.ops)))):
-            offset = index
-            device = op.attr(self._op_device_key)
-
-            # Backward pass
             if self._is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
-                scale_factor = self._num_microbatches
                 block._insert_op(
                     index=index + 1,
                     type='scale',
                     inputs={'X': loss_grad_var},
                     outputs={'Out': loss_grad_var},
                     attrs={
-                        'scale': 1.0 / scale_factor,
-                        self._op_device_key: device,
+                        'scale': 1.0 / self._num_microbatches,
                         self._op_role_key: self._op_role.Backward
                     })
                 break
-            if self._is_backward_op(op) and (
-                    self._op_role_var_key in op.attr_names):
-                op_role_var = op.all_attrs()[self._op_role_var_key]
 
-                if len(op_role_var) == 0:
+    def _rename_gradient_var_name(self, block):
+        for index, op in enumerate(block.ops):
+            if not self._is_optimize_op(op): continue
+            input_names = op.input_arg_names
+            output_names = op.output_arg_names
+            in_out_names = input_names + output_names
+            if op.type == 'cast': continue
+            # append "MERGED" to the names of parameter gradients,
+            # and mofify the op_role_var attribute (by rename_arg func).
+            for name in in_out_names:
+                if not core.grad_var_suffix() in name: continue
+                param_name = name.strip(core.grad_var_suffix())
+                new_grad_name = name + "@MERGED"
+                self._rename_arg(op, name, new_grad_name)
+
+    def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False):
+        """
+        Create a new merged gradient for each parameter and accumulate the
+        corresponding gradient to it.
+        """
+        merged_gradient_names = []
+        first_opt_op_idx = None
+
+        for index, op in reversed(tuple(enumerate(list(block.ops)))):
+            # remove the cast op of fp16 grad to fp32 grad
+            if self._is_optimize_op(op) and op.type == 'cast':
+                in_name = op.input_arg_names[0]
+                out_name = op.output_arg_names[0]
+                if out_name.strip('@GRAD') in self._param_device_map:
+                    assert in_name.replace('.cast_fp16', '') == out_name
+                    block._remove_op(index)
                     continue
+
+            if self._is_backward_op(op) and not first_opt_op_idx:
+                first_opt_op_idx = index + 1
+                # no optimize phase
+                if first_opt_op_idx == len(block.ops): return
+                if block.ops[first_opt_op_idx].type == "c_sync_comm_stream":
+                    first_opt_op_idx += 1
+
+            if self._is_backward_op(op) and (
+                    self._op_role_var_key in op.attr_names):
+                op_role_var = op.attr(self._op_role_var_key)
+                if len(op_role_var) == 0: continue
                 assert len(op_role_var) % 2 == 0
-                offset = index
                 for i in range(0, len(op_role_var), 2):
-                    grad_name = op_role_var[i + 1]
-                    grad_var = block.vars[grad_name]
-                    new_grad_var_name = unique_name.generate(grad_name)
-                    new_var = self._create_var(block, grad_var,
-                                               new_grad_var_name)
-                    self._rename_arg(op, grad_name, new_grad_var_name)
+                    offset = 0
+                    param_name = op_role_var[i]
+                    if not block.has_var(param_name): continue
+                    if '@BroadCast' in param_name: continue
+                    param_grad_name = param_name + core.grad_var_suffix()
+                    merged_param_grad_name = param_grad_name + '@MERGED'
+                    if not block.has_var(merged_param_grad_name):
+                        self._create_var(block, block.vars[param_name],
+                                         merged_param_grad_name)
+                    assert block.has_var(merged_param_grad_name)
+                    param_grad_var = block.var(param_grad_name)
+                    merged_param_grad_var = block.var(merged_param_grad_name)
+                    merged_param_grad_var.persistable = True
                     block._insert_op(
-                        index=offset + 1,
-                        type='sum',
-                        inputs={'X': [grad_var, new_var]},
-                        outputs={'Out': grad_var},
+                        index=first_opt_op_idx + offset,
+                        type='fill_constant',
+                        inputs={},
+                        outputs={'Out': [merged_param_grad_var]},
                         attrs={
-                            self._op_device_key: device,
-                            self._op_role_key: self._op_role.Backward,
-                            self._op_role_var_key: op_role_var
+                            'shape': merged_param_grad_var.shape,
+                            'dtype': merged_param_grad_var.dtype,
+                            'value': float(0),
+                            # a trick to run this op once per mini-batch
+                            self._op_role_key: self._op_role.Optimize.LRSched,
                         })
                     offset += 1
+                    grad_name = op_role_var[i + 1]
+                    grad_var = block.vars[grad_name]
+                    if not 'cast_fp16' in grad_name:
+                        block._insert_op(
+                            index=first_opt_op_idx + offset,
+                            type='sum',
+                            inputs={'X': [grad_var, merged_param_grad_var]},
+                            outputs={'Out': merged_param_grad_var},
+                            attrs={
+                                self._op_role_key: self._op_role.Backward,
+                            })
+                        offset += 1
+                        merged_gradient_names.append(merged_param_grad_name)
+                    else:
+                        # cast gradient to fp32 to accumulate to merged gradient
+                        cast_grad_var_name = param_grad_name + '@TMP'
+                        cast_grad_var = self._create_var(block, param_grad_var,
+                                                         cast_grad_var_name)
+                        cast_grad_var.persistable = False
+                        block._insert_op(
+                            index=first_opt_op_idx + offset,
+                            type='cast',
+                            inputs={'X': grad_var},
+                            outputs={'Out': cast_grad_var},
+                            attrs={
+                                'in_dtype': grad_var.dtype,
+                                'out_dtype': cast_grad_var.dtype,
+                                self._op_role_key: self._op_role.Backward,
+                            })
+                        offset += 1
+                        block._insert_op(
+                            index=first_opt_op_idx + offset,
+                            type='sum',
+                            inputs={
+                                'X': [merged_param_grad_var, cast_grad_var]
+                            },
+                            outputs={'Out': merged_param_grad_var},
+                            attrs={
+                                self._op_role_key: self._op_role.Backward,
+                            })
+                        offset += 1
+                        merged_gradient_names.append(merged_param_grad_name)
+        return merged_gradient_names
 
     def _add_sub_blocks(self, main_block, program_list):
         main_program = main_block.program
-        for prog_info in program_list:
-            prog = prog_info['program']
+        for prog in program_list:
             for op in prog.block(0).ops:
                 if not op.has_attr('sub_block'):
                     continue
@@ -4372,8 +4623,7 @@ class PipelineOptimizer(object):
         # var_info = {var_name: [program1, program2...]},
         # persistable var only
         var_info = dict()
-        for prog_info in program_list:
-            prog = prog_info['program']
+        for prog in program_list:
             block = prog.block(0)
             for var_name in block.vars:
                 if var_name == "double_buffer_0": continue
@@ -4395,7 +4645,7 @@ class PipelineOptimizer(object):
                 block = prog.block(0)
                 for op in block.ops:
                     if op.type == "recv_v2" or op.type == "create_py_reader" or \
-                        op.type == "read":
+                        op.type == "read" or op.type == "update_loss_scaling":
                         continue
                     # We have processed lr related vars
                     if op.attr(self._op_role_key) == int(
@@ -4423,6 +4673,15 @@ class PipelineOptimizer(object):
                 read_block = prog.block(0)
                 read_device = self._get_device_info(read_block)
                 read_dev_index = int(read_device.split(':')[1])
+                pair = (write_dev_index, read_dev_index)
+                pair_key = write_dev_index * 1000 + read_dev_index
+                if pair not in self._pipeline_pair:
+                    self._pipeline_pair.append(pair)
+                    self._pp_ring_map[pair_key] = self.ring_id
+                    ring_id = self.ring_id
+                    self.ring_id += 1
+                else:
+                    ring_id = self._pp_ring_map[pair_key]
 
                 write_block._insert_op(
                     index=0,
@@ -4430,11 +4689,12 @@ class PipelineOptimizer(object):
                     inputs={'X': write_block.var(var_name), },
                     attrs={
                         self._op_device_key: write_device,
-                        'use_calc_stream': True,
+                        'use_calc_stream': False,
                         # A trick to make the role LRSched to avoid copy every
                         # microbatch
                         self._op_role_key: self._op_role.LRSched,
                         'peer': read_dev_index,
+                        'ring_id': ring_id
                     })
                 read_block._insert_op(
                     index=0,
@@ -4444,36 +4704,68 @@ class PipelineOptimizer(object):
                         'out_shape': read_block.var(var_name).shape,
                         'dtype': read_block.var(var_name).dtype,
                         self._op_device_key: read_device,
-                        'use_calc_stream': True,
+                        'use_calc_stream': False,
+                        # A trick to make the role LRSched to avoid copy every
+                        # microbatch
+                        self._op_role_key: self._op_role.LRSched,
+                        'peer': write_dev_index,
+                        'ring_id': ring_id
+                    })
+                read_block._insert_op(
+                    index=1,
+                    type='c_sync_comm_stream',
+                    inputs={'X': [read_block.var(var_name)]},
+                    outputs={'Out': [read_block.var(var_name)]},
+                    attrs={
+                        self._op_device_key: read_device,
                         # A trick to make the role LRSched to avoid copy every
                         # microbatch
                         self._op_role_key: self._op_role.LRSched,
-                        'peer': write_dev_index
+                        'ring_id': ring_id
                     })
 
+    def _is_gradient_clip_op(self, op):
+        return op.desc.has_attr("op_namescope") \
+            and op.desc.attr("op_namescope").startswith("/gradient_clip")
+
+    def _is_regularization_op(self, op):
+        return op.desc.has_attr("op_namescope") \
+            and op.desc.attr("op_namescope").startswith("/regularization")
+
     def minimize(self,
                  loss,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
         main_block = loss.block
+        self.origin_main_block = main_block
         if startup_program is None:
             startup_program = default_startup_program()
         optimize_ops, params_grads = self._optimizer.minimize(
             loss, startup_program, parameter_list, no_grad_set)
-        self._param_device_map = self._optimizer._param_device_map
+        self._param_device_map = self._origin_optimizer._param_device_map
+        assert main_block.program._pipeline_opt \
+            and 'local_rank' in main_block.program._pipeline_opt, \
+            'Please use pipeline with fleet.'
+        local_rank = main_block.program._pipeline_opt['local_rank']
+        self._global_ring_id = main_block.program._pipeline_opt[
+            'global_ring_id']
+        schedule_mode = 0
+        if 'schedule_mode' in main_block.program._pipeline_opt:
+            schedule_mode = main_block.program._pipeline_opt['schedule_mode']
+        self.schedule_mode = schedule_mode
+        # micro batch size
         self.micro_batch_size = main_block.program._pipeline_opt[
             'micro_batch_size']
 
-        # Step1: add default op_device attribute for regulization and clip ops
-        self._add_opdevice_attr_for_regularization_clip(main_block)
-
-        # Step2: add default op_device attribute for ops whose op_device
-        # attribute have not been set yet. Then check all ops have the
-        # op_device attribute.
-        self._add_default_opdevice_attr(main_block)
+        self.use_sharding = False
+        if 'use_sharding' in main_block.program._pipeline_opt:
+            self.use_sharding = main_block.program._pipeline_opt['use_sharding']
+        self.ring_id = main_block.program._pipeline_opt['ring_id']
 
-        device_specs = self._check_validation(main_block)
+        # Step1: add default op_device attribute for ops.
+        self._add_op_device_attr(main_block)
+        device_list = self._check_validation(main_block)
 
         def device_cmp(device1, device2):
             dev1_id = int(device1.split(':')[1])
@@ -4485,33 +4777,29 @@ class PipelineOptimizer(object):
             else:
                 return 0
 
-        sorted_device_spec = sorted(device_specs, key=cmp_to_key(device_cmp))
-        assert sorted_device_spec == device_specs, (
-            "With pipeline "
-            "parallelism, you must use gpu devices one after another "
-            "in the order of their ids.")
-
-        # Step3: add send and recv ops between section boundaries
+        sorted_device_list = sorted(device_list, key=cmp_to_key(device_cmp))
+        assert sorted_device_list == device_list, (
+            "With pipeline parallelism, you must use gpu devices one after "
+            "another in the order of their ids.")
+        # Step2: add send and recv ops between section boundaries
         self._insert_sendrecv_ops_for_boundaries(main_block)
 
-        # Step4: split program into sections and add pairs of
+        # Step3: split program into sections and add pairs of
         # send and recv ops for data var.
         main_program = main_block.program
-        program_list = self._split_program(main_program, device_specs)
+        program_list = self._split_program(main_program, device_list)
         for p in program_list:
-            self._create_vars(p["program"].block(0),
-                              main_program.global_block())
-        self._insert_sendrecv_for_data_var(main_block, program_list,
-                                           startup_program, device_specs)
+            self._create_vars(p.global_block(), main_block)
 
-        # Step5: Special Case: process persistable vars that exist in
+        # Step4: Special Case: process persistable vars that exist in
         # multiple sections
         self._process_persistable_vars_in_multi_sections(
             main_program, startup_program, program_list)
 
-        # Step6: Add sub blocks for section programs
+        # Step5: Add sub blocks for section programs
         self._add_sub_blocks(main_block, program_list)
 
+<<<<<<< HEAD
         assert (main_program._pipeline_opt and
                 isinstance(main_program._pipeline_opt, dict) and
                 'local_rank' in main_program._pipeline_opt), \
@@ -4520,35 +4808,45 @@ class PipelineOptimizer(object):
             device_specs)
         self.schedule_mode = main_program._pipeline_opt['schedule_mode']
 
+=======
+        local_rank = main_program._pipeline_opt['local_rank'] % len(device_list)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
         place_list = []
-        for dev_spec in device_specs:
-            dev_index = dev_spec.split(":")[1]
-            place_list.append(core.CUDAPlace(local_rank))
+        for dev in device_list:
+            dev_index = int(dev.split(":")[1])
+            place_list.append(core.CUDAPlace(dev_index % 8))
 
-        # Step7: Split startup program
+        # Step6: Split startup program
         new_startup_program = self._split_startup_program(startup_program,
                                                           local_rank)
 
-        # Step8: clear gradients before each mini-batch and 
-        # accumulate gradients during backward
-        self._clear_gradients(
-            program_list[local_rank]['program'].global_block(),
-            dev_spec=device_specs[local_rank])
-        self._accumulate_gradients(program_list[local_rank]['program']
-                                   .global_block())
-
         startup_program._pipeline_opt = {
             "startup_program": new_startup_program,
         }
+        real_block = program_list[local_rank].global_block()
+        self._insert_loss_scale(real_block)
+        if not self.use_sharding:
+            # Step7: clear gradients before each mini-batch and 
+            # accumulate gradients during backward
+            self._rename_gradient_var_name(real_block)
+            real_block._sync_with_cpp()
+            self._accumulate_gradients(real_block)
+            real_block._sync_with_cpp()
 
         place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
         main_program._pipeline_opt = {
             "trainer": "PipelineTrainer",
             "device_worker": "Section",
             "pipeline_stage": local_rank,
+<<<<<<< HEAD
             "num_pipeline_stages": len(device_specs),
             "schedule_mode": self.schedule_mode,
             "inner_parallelism": len(device_specs),
+=======
+            "num_pipeline_stages": len(device_list),
+            "schedule_mode": self.schedule_mode,
+            "inner_parallelism": len(device_list),
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
             "section_program": program_list[local_rank],
             "place": place_list[local_rank],
             "place_id": place_id,
@@ -4556,7 +4854,7 @@ class PipelineOptimizer(object):
             "num_microbatches": self._num_microbatches,
             "start_cpu_core_id": self._start_cpu_core_id,
         }
-        return optimize_ops, params_grads, program_list
+        return optimize_ops, params_grads, program_list, self._pipeline_pair, self._pp_ring_map
 
 
 class RecomputeOptimizer(Optimizer):
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index be196b73ed..9f2b2127aa 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -165,6 +165,12 @@ class DataLoader(object):
 
     For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
 
+    .. note::
+        GPU tensor operation is not supported in subprocess currently,
+        please don't use GPU tensor operations in pipeline which will
+        be performed in subprocess, such as dataset transforms, collte_fn,
+        etc. Numpy array and CPU tensor operation is supported.
+
     **Disable automatic batching**
 
     In certain cases such as some NLP tasks, instead of automatic batching,
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index 899d6ae7f0..1d40415141 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -9,7 +9,8 @@ endforeach()
 add_subdirectory(unittests)
 add_subdirectory(book)
 
-# TODO: support New Custom OP on Mac
+# 2.0 New custom OP can support Windows/Linux now
+# TODO: support 2.0 New Custom OP on Mac
 if(NOT APPLE)
   add_subdirectory(custom_op)
 endif()
diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
index 36496ec499..1ecaff458d 100644
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -1,6 +1,6 @@
 # New custom OP can support Windows/Linux now
 if(WITH_GPU)
-    # 'test_custom_relu_op_setup/jit' compile .cc and .cu file
+    # GPU custom op tests: compile both .cc and .cu file
     py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py)
     py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py)
     py_test(test_custom_relu_model SRCS test_custom_relu_model.py)
@@ -11,8 +11,11 @@ if(WITH_GPU)
     set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180)
 endif()
 
+<<<<<<< HEAD
 py_test(test_sysconfig SRCS test_sysconfig.py)
 
+=======
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 # CPU custom op tests: only compile .cc file
 py_test(test_dispatch_jit SRCS test_dispatch_jit.py)
 py_test(test_multi_out_jit SRCS test_multi_out_jit.py)
@@ -21,41 +24,9 @@ py_test(test_custom_concat SRCS test_custom_concat.py)
 py_test(test_custom_conj SRCS test_custom_conj.py)
 
 # other tests
+<<<<<<< HEAD
+=======
+py_test(test_sysconfig SRCS test_sysconfig.py)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 py_test(test_check_abi SRCS test_check_abi.py)
 cc_test(test_check_error SRCS test_check_error.cc DEPS gtest)
-
-if(NOT LINUX)
-    return()
-endif()
-
-# Old custom OP only support Linux, only run on Linux
-py_test(test_custom_op SRCS test_custom_op.py)
-py_test(test_jit_load SRCS test_jit_load.py)
-py_test(test_setup_install SRCS test_setup_install.py)
-py_test(test_setup_build SRCS test_setup_build.py)
-
-set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180)
-set_tests_properties(test_setup_install PROPERTIES TIMEOUT 250)
-set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180)
-
-
-if(WITH_ROCM)
-    hip_library(relu_op_shared SHARED SRCS relu_op.cc relu_op.cu DEPS paddle_framework_shared)
-elseif(WITH_GPU)
-    nv_library(relu_op_shared SHARED SRCS relu_op.cc relu_op.cu DEPS paddle_framework_shared)
-else()
-    cc_library(relu_op_shared SHARED SRCS relu_op.cc DEPS paddle_framework_shared)
-endif()
-set_target_properties(relu_op_shared PROPERTIES OUTPUT_NAME relu2_op)
-target_link_libraries(relu_op_shared ${FLUID_FRAMEWORK_SHARED_LIB})
-
-# remove the linked glog and gflags when compling relu_op_shared
-# otherwise, there is running error:
-# ERROR: something wrong with flag 'logtostderr' in file
-# 'third_party/glog/src/extern_glog/src/logging.cc'.
-# One possibility: file 'third_party/glog/src/extern_glog/src/logging.cc'
-# is being linked both statically and dynamically into this executable.
-get_target_property(TARGET_LIBRARIES relu_op_shared LINK_LIBRARIES)
-LIST(REMOVE_ITEM TARGET_LIBRARIES glog)
-LIST(REMOVE_ITEM TARGET_LIBRARIES gflags)
-set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES  ${TARGET_LIBRARIES} )
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
index c0b30a1cb5..c075d27f7b 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
@@ -38,9 +38,8 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data,
 }
 
 std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
-  out.reshape(x.shape());
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "relu_cpu_forward", ([&] {
         relu_cpu_forward_kernel<data_t>(
diff --git a/python/paddle/fluid/tests/custom_op/relu_op.cc b/python/paddle/fluid/tests/custom_op/relu_op.cc
deleted file mode 100644
index 837f5bab6b..0000000000
--- a/python/paddle/fluid/tests/custom_op/relu_op.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class Relu2Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Y", in_dims);
-  }
-};
-
-class Relu2OpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input tensor.");
-    AddOutput("Y", "Output of relu_op");
-    AddComment(R"DOC(
-Relu2 Operator.
-)DOC");
-  }
-};
-
-class Relu2GradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim(framework::GradVarName("Y"));
-    ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
-  }
-};
-
-template <typename T>
-class Relu2GradMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("relu2_grad");
-    op->SetInput("Y", this->Output("Y"));
-    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
-    op->SetAttrMap(this->Attrs());
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-  }
-};
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class Relu2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_t = ctx.Input<Tensor>("X");
-    auto* out_t = ctx.Output<Tensor>("Y");
-    auto x = in_t->data<T>();
-    auto y = out_t->mutable_data<T>(ctx.GetPlace());
-    for (int i = 0; i < in_t->numel(); ++i) {
-      y[i] = std::max(static_cast<T>(0.), x[i]);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Relu2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto* y_t = ctx.Input<Tensor>("Y");
-    auto* dx_t = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto dy = dy_t->data<T>();
-    auto y = y_t->data<T>();
-    auto dx = dx_t->mutable_data<T>(ctx.GetPlace());
-
-    for (int i = 0; i < y_t->numel(); ++i) {
-      dx[i] = dy[i] * (y[i] > static_cast<T>(0) ? 1. : 0.);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-REGISTER_OPERATOR(relu2,
-                  ops::Relu2Op,
-                  ops::Relu2OpMaker,
-                  ops::Relu2GradMaker<paddle::framework::OpDesc>,
-                  ops::Relu2GradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(relu2_grad, ops::Relu2GradOp);
-REGISTER_OP_CPU_KERNEL(relu2,
-                       ops::Relu2Kernel<CPU, float>,
-                       ops::Relu2Kernel<CPU, double>);
-REGISTER_OP_CPU_KERNEL(relu2_grad,
-                       ops::Relu2GradKernel<CPU, float>,
-                       ops::Relu2GradKernel<CPU, double>);
diff --git a/python/paddle/fluid/tests/custom_op/relu_op.cu b/python/paddle/fluid/tests/custom_op/relu_op.cu
deleted file mode 100644
index 53ad75e413..0000000000
--- a/python/paddle/fluid/tests/custom_op/relu_op.cu
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-__global__ void KeRelu2(const T* x, const int num, T* y) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
-    y[i] = max(x[i], static_cast<T>(0.));
-  }
-}
-
-template <typename DeviceContext, typename T>
-class Relu2CUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_t = ctx.Input<Tensor>("X");
-    auto* out_t = ctx.Output<Tensor>("Y");
-    auto x = in_t->data<T>();
-    auto y = out_t->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    int num = in_t->numel();
-    int block = 512;
-    int grid = (num + block - 1) / block;
-    KeRelu2<T><<<grid, block, 0, dev_ctx.stream()>>>(x, num, y);
-  }
-};
-
-template <typename T>
-__global__ void KeRelu2Grad(const T* y, const T* dy, const int num, T* dx) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
-    dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class Relu2GradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto* y_t = ctx.Input<Tensor>("Y");
-    auto* dx_t = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto dy = dy_t->data<T>();
-    auto y = y_t->data<T>();
-    auto dx = dx_t->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    int num = dy_t->numel();
-    int block = 512;
-    int grid = (num + block - 1) / block;
-    KeRelu2Grad<T><<<grid, block, 0, dev_ctx.stream()>>>(y, dy, num, dx);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(relu2,
-                        paddle::operators::Relu2CUDAKernel<CUDA, float>,
-                        paddle::operators::Relu2CUDAKernel<CUDA, double>);
-
-REGISTER_OP_CUDA_KERNEL(relu2_grad,
-                        paddle::operators::Relu2GradCUDAKernel<CUDA, float>,
-                        paddle::operators::Relu2GradCUDAKernel<CUDA, double>);
diff --git a/python/paddle/fluid/tests/custom_op/relu_op3.cc b/python/paddle/fluid/tests/custom_op/relu_op3.cc
deleted file mode 100644
index ace9598c58..0000000000
--- a/python/paddle/fluid/tests/custom_op/relu_op3.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class Relu3Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Y", in_dims);
-  }
-};
-
-class Relu3OpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input tensor.");
-    AddOutput("Y", "Output of relu_op");
-    AddComment(R"DOC(
-Relu3 Operator.
-)DOC");
-  }
-};
-
-class Relu3GradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim(framework::GradVarName("Y"));
-    ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
-  }
-};
-
-template <typename T>
-class Relu3GradMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("relu3_grad");
-    op->SetInput("Y", this->Output("Y"));
-    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
-    op->SetAttrMap(this->Attrs());
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-  }
-};
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class Relu3Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_t = ctx.Input<Tensor>("X");
-    auto* out_t = ctx.Output<Tensor>("Y");
-    auto x = in_t->data<T>();
-    auto y = out_t->mutable_data<T>(ctx.GetPlace());
-    for (int i = 0; i < in_t->numel(); ++i) {
-      y[i] = std::max(static_cast<T>(0.), x[i]);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class Relu3GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto* y_t = ctx.Input<Tensor>("Y");
-    auto* dx_t = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto dy = dy_t->data<T>();
-    auto y = y_t->data<T>();
-    auto dx = dx_t->mutable_data<T>(ctx.GetPlace());
-
-    for (int i = 0; i < y_t->numel(); ++i) {
-      dx[i] = dy[i] * (y[i] > static_cast<T>(0) ? 1. : 0.);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-REGISTER_OPERATOR(relu3,
-                  ops::Relu3Op,
-                  ops::Relu3OpMaker,
-                  ops::Relu3GradMaker<paddle::framework::OpDesc>,
-                  ops::Relu3GradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(relu3_grad, ops::Relu3GradOp);
-REGISTER_OP_CPU_KERNEL(relu3,
-                       ops::Relu3Kernel<CPU, float>,
-                       ops::Relu3Kernel<CPU, double>);
-REGISTER_OP_CPU_KERNEL(relu3_grad,
-                       ops::Relu3GradKernel<CPU, float>,
-                       ops::Relu3GradKernel<CPU, double>);
diff --git a/python/paddle/fluid/tests/custom_op/relu_op3.cu b/python/paddle/fluid/tests/custom_op/relu_op3.cu
deleted file mode 100644
index 8a229cafeb..0000000000
--- a/python/paddle/fluid/tests/custom_op/relu_op3.cu
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-__global__ void KeRelu3(const T* x, const int num, T* y) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
-    y[i] = max(x[i], static_cast<T>(0.));
-  }
-}
-
-template <typename DeviceContext, typename T>
-class Relu3CUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_t = ctx.Input<Tensor>("X");
-    auto* out_t = ctx.Output<Tensor>("Y");
-    auto x = in_t->data<T>();
-    auto y = out_t->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    int num = in_t->numel();
-    int block = 512;
-    int grid = (num + block - 1) / block;
-    KeRelu3<T><<<grid, block, 0, dev_ctx.stream()>>>(x, num, y);
-  }
-};
-
-template <typename T>
-__global__ void KeRelu3Grad(const T* y, const T* dy, const int num, T* dx) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
-    dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class Relu3GradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto* y_t = ctx.Input<Tensor>("Y");
-    auto* dx_t = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto dy = dy_t->data<T>();
-    auto y = y_t->data<T>();
-    auto dx = dx_t->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    int num = dy_t->numel();
-    int block = 512;
-    int grid = (num + block - 1) / block;
-    KeRelu3Grad<T><<<grid, block, 0, dev_ctx.stream()>>>(y, dy, num, dx);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(relu3,
-                        paddle::operators::Relu3CUDAKernel<CUDA, float>,
-                        paddle::operators::Relu3CUDAKernel<CUDA, double>);
-
-REGISTER_OP_CUDA_KERNEL(relu3_grad,
-                        paddle::operators::Relu3GradCUDAKernel<CUDA, float>,
-                        paddle::operators::Relu3GradCUDAKernel<CUDA, double>);
diff --git a/python/paddle/fluid/tests/custom_op/setup_build.py b/python/paddle/fluid/tests/custom_op/setup_build.py
deleted file mode 100644
index 16a7477930..0000000000
--- a/python/paddle/fluid/tests/custom_op/setup_build.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-
-from utils import paddle_includes, extra_compile_args
-from paddle.utils.cpp_extension import CppExtension, CUDAExtension, BuildExtension, setup
-from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method
-
-# switch to old custom op method
-use_new_custom_op_load_method(False)
-
-file_dir = os.path.dirname(os.path.abspath(__file__))
-
-setup(
-    name='librelu2_op_from_setup',
-    ext_modules=[
-        CUDAExtension(
-            sources=['relu_op3.cc', 'relu_op3.cu', 'relu_op.cc',
-                     'relu_op.cu'],  # test for multi ops
-            include_dirs=paddle_includes,
-            extra_compile_args=extra_compile_args)
-    ],
-    cmdclass={
-        'build_ext': BuildExtension.with_options(
-            no_python_abi_suffix=True, output_dir=file_dir)  # for unittest
-    })
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_op.py b/python/paddle/fluid/tests/custom_op/test_custom_op.py
deleted file mode 100644
index 1c0db0be15..0000000000
--- a/python/paddle/fluid/tests/custom_op/test_custom_op.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import numpy as np
-import unittest
-import contextlib
-
-import paddle
-import paddle.fluid as fluid
-paddle.enable_static()
-
-
-def load_so(so_name):
-    """
-    Load .so file and parse custom op into OpInfoMap.
-    """
-    file_dir = os.path.dirname(os.path.abspath(__file__))
-    fluid.load_op_library(os.path.join(file_dir, so_name))
-
-
-from paddle.fluid.layer_helper import LayerHelper
-
-
-def relu2(x, name=None):
-    helper = LayerHelper("relu2", **locals())
-    out = helper.create_variable(
-        type=x.type, name=name, dtype=x.dtype, persistable=False)
-    helper.append_op(type="relu2", inputs={"X": x}, outputs={"Y": out})
-    return out
-
-
-@contextlib.contextmanager
-def scope_prog_guard():
-    prog = fluid.Program()
-    startup_prog = fluid.Program()
-    scope = fluid.core.Scope()
-    with fluid.scope_guard(scope):
-        with fluid.program_guard(prog, startup_prog):
-            yield
-
-
-def linear_fc(data, label, use_custom_relu):
-    hidden = fluid.layers.fc(data, size=128)
-    hidden = relu2(hidden) if use_custom_relu else fluid.layers.relu(hidden)
-    hidden = fluid.layers.fc(hidden, size=128)
-    hidden = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=hidden, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def custom_op_test(use_gpu=True, use_custom_relu=True):
-    with scope_prog_guard():
-        np.random.seed(0)
-        fluid.default_startup_program().random_seed = 10
-        fluid.default_main_program().random_seed = 10
-
-        data = fluid.layers.data(
-            name='data', shape=[1, 28, 28], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        loss = linear_fc(data, label, use_custom_relu)
-
-        optimizer = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
-        optimizer.minimize(loss)
-
-        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        compile_program = fluid.compiler.CompiledProgram(
-            fluid.default_main_program()).with_data_parallel(
-                loss_name=loss.name)
-
-        reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=32)
-        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-
-        num = 4
-        for i, data in enumerate(reader()):
-            outs, = exe.run(compile_program,
-                            feed=feeder.feed(data),
-                            fetch_list=[loss])
-            if i == num:
-                break
-        return outs
-
-
-class CustomOpTest(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(2)
-
-    def test_cpu(self):
-        actual = custom_op_test(False, True)
-        expect = custom_op_test(False, False)
-        self.assertEqual(actual.all(), expect.all())
-
-    def test_gpu(self):
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        actual = custom_op_test(True, True)
-        expect = custom_op_test(True, False)
-        self.assertEqual(actual.all(), expect.all())
-
-
-if __name__ == '__main__':
-    load_so(so_name='librelu2_op.so')
-    unittest.main()
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
index 23733d2084..641630b0f4 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
@@ -103,11 +103,11 @@ class TestJITLoad(unittest.TestCase):
                 in str(e))
             if IS_WINDOWS:
                 self.assertTrue(
-                    r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:48"
+                    r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:47"
                     in str(e))
             else:
                 self.assertTrue(
-                    "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:48"
+                    "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:47"
                     in str(e))
         self.assertTrue(caught_exception)
 
diff --git a/python/paddle/fluid/tests/custom_op/test_jit_load.py b/python/paddle/fluid/tests/custom_op/test_jit_load.py
deleted file mode 100644
index 4e6d74b7d6..0000000000
--- a/python/paddle/fluid/tests/custom_op/test_jit_load.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-import paddle
-import numpy as np
-from paddle.utils.cpp_extension import load
-from utils import paddle_includes, extra_cc_args, extra_nvcc_args
-from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method
-
-# switch to old custom op method
-use_new_custom_op_load_method(False)
-
-# Compile and load custom op Just-In-Time.
-custom_module = load(
-    name='custom_relu2',
-    sources=['relu_op.cc', 'relu_op.cu', 'relu_op3.cc', 'relu_op3.cu'],
-    extra_include_paths=paddle_includes,  # add for Coverage CI
-    extra_cxx_cflags=extra_cc_args,  # test for cc flags
-    extra_cuda_cflags=extra_nvcc_args,  # test for nvcc flags
-    verbose=True  # add for unittest
-)
-
-
-class TestJITLoad(unittest.TestCase):
-    def test_api(self):
-        raw_data = np.array([[-1, 1, 0], [1, -1, -1]]).astype('float32')
-        gt_data = np.array([[0, 1, 0], [1, 0, 0]]).astype('float32')
-        x = paddle.to_tensor(raw_data, dtype='float32')
-        # use custom api
-        out = custom_module.relu2(x)
-        out3 = custom_module.relu3(x)
-
-        self.assertTrue(np.array_equal(out.numpy(), gt_data))
-        self.assertTrue(np.array_equal(out3.numpy(), gt_data))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/custom_op/test_setup_build.py b/python/paddle/fluid/tests/custom_op/test_setup_build.py
deleted file mode 100644
index 1ef14c2e3a..0000000000
--- a/python/paddle/fluid/tests/custom_op/test_setup_build.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-import numpy as np
-from test_custom_op import CustomOpTest, load_so
-import paddle
-from paddle.utils.cpp_extension.extension_utils import run_cmd
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method
-
-# switch to old custom op method
-use_new_custom_op_load_method(False)
-
-
-def compile_so():
-    """
-    Compile .so file by running setup.py config.
-    """
-    # build .so with setup.py
-    file_dir = os.path.dirname(os.path.abspath(__file__))
-    cmd = 'cd {} && python setup_build.py build'.format(file_dir)
-    run_cmd(cmd)
-
-
-# `setup.py build` only produce .so file containing multi operators.
-#  Python Interface should be added manually. `relu2` api is in `test_custom_op.py`
-def relu3(x, name=None):
-    helper = LayerHelper("relu3", **locals())
-    out = helper.create_variable(
-        type=x.type, name=name, dtype=x.dtype, persistable=False)
-    helper.append_op(type="relu3", inputs={"X": x}, outputs={"Y": out})
-    return out
-
-
-class TestCompileMultiOp(unittest.TestCase):
-    def setUp(self):
-        paddle.disable_static()
-
-    def test_relu3(self):
-        raw_data = np.array([[-1, 1, 0], [1, -1, -1]]).astype('float32')
-        x = paddle.to_tensor(raw_data, dtype='float32')
-        # use custom api
-        out = relu3(x)
-
-        self.assertTrue(
-            np.array_equal(out.numpy(),
-                           np.array([[0, 1, 0], [1, 0, 0]]).astype('float32')))
-
-    def tearDown(self):
-        paddle.enable_static()
-
-
-if __name__ == '__main__':
-    compile_so()
-    load_so(so_name='librelu2_op_from_setup.so')
-    unittest.main()
diff --git a/python/paddle/fluid/tests/custom_op/test_setup_install.py b/python/paddle/fluid/tests/custom_op/test_setup_install.py
deleted file mode 100644
index 1fd7b8a06f..0000000000
--- a/python/paddle/fluid/tests/custom_op/test_setup_install.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import site
-import unittest
-import paddle
-import subprocess
-import numpy as np
-from paddle.utils.cpp_extension.extension_utils import run_cmd
-from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method
-
-# switch to old custom op method
-use_new_custom_op_load_method(False)
-
-
-class TestSetUpInstall(unittest.TestCase):
-    def setUp(self):
-        cur_dir = os.path.dirname(os.path.abspath(__file__))
-        # compile, install the custom op egg into site-packages under background
-        cmd = 'cd {} && python setup_install.py install'.format(cur_dir)
-        run_cmd(cmd)
-
-        # NOTE(Aurelius84): Normally, it's no need to add following codes for users.
-        # But we simulate to pip install in current process, so interpreter don't snap
-        # sys.path has been updated. So we update it manually.
-
-        # See: https://stackoverflow.com/questions/56974185/import-runtime-installed-module-using-pip-in-python-3
-        site_dir = site.getsitepackages()[0]
-        custom_egg_path = [
-            x for x in os.listdir(site_dir) if 'custom_relu2' in x
-        ]
-        assert len(custom_egg_path) == 1, "Matched egg number is %d." % len(
-            custom_egg_path)
-        sys.path.append(os.path.join(site_dir, custom_egg_path[0]))
-
-    def test_api(self):
-        # usage: import the package directly
-        import custom_relu2
-
-        raw_data = np.array([[-1, 1, 0], [1, -1, -1]]).astype('float32')
-        gt_data = np.array([[0, 1, 0], [1, 0, 0]]).astype('float32')
-        x = paddle.to_tensor(raw_data, dtype='float32')
-        # use custom api
-        out = custom_relu2.relu2(x)
-        out3 = custom_relu2.relu3(x)
-
-        self.assertTrue(np.array_equal(out.numpy(), gt_data))
-        self.assertTrue(np.array_equal(out3.numpy(), gt_data))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index b5c554a58c..0c292d355d 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -450,6 +450,7 @@ py_test_modules(test_imperative_static_runner_mnist MODULES test_imperative_stat
     FLAGS_cudnn_deterministic=1)
 py_test_modules(test_imperative_static_runner_while MODULES test_imperative_static_runner_while ENVS
     FLAGS_cudnn_deterministic=1)
+set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 if(WITH_DISTRIBUTE)
     # FIXME(typhoonzero): add these tests back
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
index c28997c5c1..517cff39a2 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
@@ -233,6 +233,7 @@ def for_iter_var_idx(x_array):
     return z
 
 
+# 17. for a,b,c in z: (a, b, c) is a tuple
 @paddle.jit.to_static
 def for_tuple_as_iter_var(x_array):
     x = paddle.to_tensor(x_array)
@@ -250,6 +251,7 @@ def for_tuple_as_iter_var(x_array):
     return a_result, b_result, c_result
 
 
+# 18. for t in enumerate(collection): t is tuple of (idx, element)
 @paddle.jit.to_static
 def for_tuple_as_enumerate_iter(x_array):
     x = paddle.to_tensor(x_array)
@@ -263,6 +265,7 @@ def for_tuple_as_enumerate_iter(x_array):
     return a_result
 
 
+# 19. for i, (a, b, c, d, e) in enumerate(collection): (a, b, c, d, e) is a tuple
 @paddle.jit.to_static
 def for_tuple_as_enumerate_value(x_array):
     x = paddle.to_tensor(x_array)
@@ -284,6 +287,23 @@ def for_tuple_as_enumerate_value(x_array):
     return a_result
 
 
+# 20. test for function in a class
+class ForwardContainsForLayer(paddle.nn.Layer):
+    def __init__(self):
+        super(ForwardContainsForLayer, self).__init__()
+        self.high = 5
+        self.low = 3
+
+    @paddle.jit.to_static
+    def forward(self, x):
+        # just for test case, x is useless in this method
+        y = paddle.zeros([10, 2, 3])
+        z = []
+        for i in range(self.high - self.low):
+            z.append(y[i].clone())
+        return z
+
+
 class TestTransformBase(unittest.TestCase):
     def setUp(self):
         self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
@@ -313,11 +333,11 @@ class TestTransformBase(unittest.TestCase):
 class TestTransform(TestTransformBase):
     def transformed_result_compare(self):
         dy_outs = self.get_dygraph_output()
-        if not isinstance(dy_outs, tuple):
+        if not isinstance(dy_outs, (tuple, list)):
             dy_outs = (dy_outs, )
 
         st_outs = self.get_static_output()
-        if not isinstance(st_outs, tuple):
+        if not isinstance(st_outs, (tuple, list)):
             st_outs = (st_outs, )
 
         for x, y in zip(dy_outs, st_outs):
@@ -446,5 +466,10 @@ class TestForTupleAsEnumerateValue(TestForIterVarNumpy):
         self.dygraph_func = for_tuple_as_enumerate_value
 
 
+class TestForwardContainsForLayer(TestForIterVarNumpy):
+    def set_test_func(self):
+        self.dygraph_func = ForwardContainsForLayer()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
index 993493a3cc..010086bfbb 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
@@ -46,6 +46,7 @@ class InferencePassTest(unittest.TestCase):
         self.enable_mkldnn = False
         self.enable_mkldnn_bfloat16 = False
         self.enable_trt = False
+        self.enable_tensorrt_oss = True
         self.trt_parameters = None
         self.dynamic_shape_params = None
         self.enable_lite = False
@@ -133,6 +134,8 @@ class InferencePassTest(unittest.TestCase):
                         self.dynamic_shape_params.max_input_shape,
                         self.dynamic_shape_params.optim_input_shape,
                         self.dynamic_shape_params.disable_trt_plugin_fp16)
+                if self.enable_tensorrt_oss:
+                    config.enable_tensorrt_oss()
 
         elif use_mkldnn:
             config.enable_mkldnn()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py
new file mode 100644
index 0000000000..8bbba7c8b5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import itertools
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTAffineChannelTest(InferencePassTest):
+    def setUp(self):
+        self.bs = 2
+        self.channel = 8
+        self.height = 16
+        self.width = 16
+        self.data_layout = 'NCHW'
+        self.precision = AnalysisConfig.Precision.Float32
+        self.serialize = False
+        self.enable_trt = True
+
+    def build(self):
+        # set min_graph_size to 2, 
+        # because affine channel doesn't support nhwc format
+        self.trt_parameters = InferencePassTest.TensorRTParam(
+            1 << 30, self.bs, 2, self.precision, self.serialize, False)
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            if self.data_layout == 'NCHW':
+                shape = [-1, self.channel, self.height, self.width]
+            else:
+                shape = [-1, self.height, self.width, self.channel]
+
+            data = fluid.data(name='in', shape=shape, dtype='float32')
+            # set scale, bias by constant
+            scale = fluid.layers.create_parameter(
+                shape=[self.channel],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(2.))
+            bias = fluid.layers.create_parameter(
+                shape=[self.channel],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(.5))
+            affine_channel_out = fluid.layers.affine_channel(
+                data, scale=scale, bias=bias, data_layout=self.data_layout)
+            out = fluid.layers.batch_norm(affine_channel_out, is_test=True)
+
+        shape[0] = self.bs
+        self.feeds = {'in': np.random.random(shape).astype('float32'), }
+        self.fetch_list = [out]
+
+    def check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            atol = 1e-5
+            if self.trt_parameters.precision == AnalysisConfig.Precision.Half:
+                atol = 1e-3
+            self.check_output_with_option(use_gpu, atol, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+    def run_test(self):
+        self.build()
+        self.check_output()
+
+    def run_test_all(self):
+        precision_opt = [
+            AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half
+        ]
+        serialize_opt = [False, True]
+
+        if self.data_layout == 'NCHW':
+            min_shape = [
+                self.bs, self.channel, self.height // 2, self.width // 2
+            ]
+            max_shape = [self.bs, self.channel, self.height * 2, self.width * 2]
+            opt_shape = [self.bs, self.channel, self.height, self.width]
+
+        if self.data_layout == 'NHWC':
+            min_shape = [
+                self.bs, self.height // 2, self.width // 2, self.channel
+            ]
+            max_shape = [self.bs, self.height * 2, self.width * 2, self.channel]
+            opt_shape = [self.bs, self.height, self.width, self.channel]
+
+        dynamic_shape_profile = InferencePassTest.DynamicShapeParam({
+            'in': min_shape
+        }, {'in': max_shape}, {'in': opt_shape}, False)
+        dynamic_shape_opt = [None, dynamic_shape_profile]
+
+        for precision, serialize, dynamic_shape in itertools.product(
+                precision_opt, serialize_opt, dynamic_shape_opt):
+            self.precision = precision
+            self.serialize = serialize
+            self.dynamic_shape_params = dynamic_shape
+            self.run_test()
+
+    def test_base(self):
+        self.run_test()
+
+    def test_fp16(self):
+        self.precision = AnalysisConfig.Precision.Half
+        self.run_test()
+
+    def test_serialize(self):
+        self.serialize = True
+        self.run_test()
+
+    def test_dynamic(self):
+        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({
+            'in': [self.bs, self.channel, self.height // 2, self.width // 2]
+        }, {'in': [self.bs, self.channel, self.height * 2, self.width * 2]
+            }, {'in': [self.bs, self.channel, self.height, self.width]}, False)
+        self.run_test()
+
+    def test_nchw_all(self):
+        self.run_test_all()
+
+    def test_nhwc(self):
+        self.data_layout = 'NHWC'
+        self.run_test_all()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py
new file mode 100644
index 0000000000..3ca6985985
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import itertools
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TensorRTMultiClassNMSTest(InferencePassTest):
+    def setUp(self):
+        self.enable_trt = True
+        self.enable_tensorrt_oss = True
+        self.precision = AnalysisConfig.Precision.Float32
+        self.serialize = False
+        self.bs = 1
+        self.background_label = -1
+        self.score_threshold = .5
+        self.nms_top_k = 8
+        self.nms_threshold = .3
+        self.keep_top_k = 8
+        self.normalized = False
+        self.num_classes = 8
+        self.num_boxes = 8
+        self.trt_parameters = InferencePassTest.TensorRTParam(
+            1 << 30, self.bs, 2, self.precision, self.serialize, False)
+
+    def build(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            boxes = fluid.data(
+                name='bboxes', shape=[-1, self.num_boxes, 4], dtype='float32')
+            scores = fluid.data(
+                name='scores',
+                shape=[-1, self.num_classes, self.num_boxes],
+                dtype='float32')
+            multiclass_nms_out = fluid.layers.multiclass_nms(
+                bboxes=boxes,
+                scores=scores,
+                background_label=self.background_label,
+                score_threshold=self.score_threshold,
+                nms_top_k=self.nms_top_k,
+                nms_threshold=self.nms_threshold,
+                keep_top_k=self.keep_top_k,
+                normalized=self.normalized)
+            mutliclass_nms_out = multiclass_nms_out + 1.
+            multiclass_nms_out = fluid.layers.reshape(
+                multiclass_nms_out, [self.bs, 1, self.keep_top_k, 6],
+                name='reshape')
+            out = fluid.layers.batch_norm(multiclass_nms_out, is_test=True)
+
+        boxes_data = np.arange(self.num_boxes * 4).reshape(
+            [self.bs, self.num_boxes, 4]).astype('float32')
+        scores_data = np.arange(1 * self.num_classes * self.num_boxes).reshape(
+            [self.bs, self.num_classes, self.num_boxes]).astype('float32')
+        self.feeds = {
+            'bboxes': boxes_data,
+            'scores': scores_data,
+        }
+        self.fetch_list = [out]
+
+    def run_test(self):
+        self.build()
+        self.check_output()
+
+    def run_test_all(self):
+        precision_opt = [
+            AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half
+        ]
+        serialize_opt = [False, True]
+        max_shape = {
+            'bboxes': [self.bs, self.num_boxes, 4],
+            'scores': [self.bs, self.num_classes, self.num_boxes],
+        }
+        opt_shape = max_shape
+        dynamic_shape_opt = [
+            None, InferencePassTest.DynamicShapeParam({
+                'bboxes': [1, 1, 4],
+                'scores': [1, 1, 1]
+            }, max_shape, opt_shape, False)
+        ]
+        for precision, serialize, dynamic_shape in itertools.product(
+                precision_opt, serialize_opt, dynamic_shape_opt):
+            self.precision = precision
+            self.serialize = serialize
+            self.dynamic_shape_params = dynamic_shape
+            self.build()
+            self.check_output()
+
+    def check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+    def test_base(self):
+        self.run_test()
+
+    def test_fp16(self):
+        self.precision = AnalysisConfig.Precision.Half
+        self.run_test()
+
+    def test_serialize(self):
+        self.serialize = True
+        self.run_test()
+
+    def test_dynamic(self):
+        max_shape = {
+            'bboxes': [self.bs, self.num_boxes, 4],
+            'scores': [self.bs, self.num_classes, self.num_boxes],
+        }
+        opt_shape = max_shape
+        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({
+            'bboxes': [1, 1, 4],
+            'scores': [1, 1, 1]
+        }, max_shape, opt_shape, False)
+        self.run_test()
+
+    def test_background(self):
+        self.background = 7
+        self.run_test()
+
+    def test_disable_oss(self):
+        self.diable_tensorrt_oss = False
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py
new file mode 100644
index 0000000000..fa276dd342
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTRoiAlignTest(InferencePassTest):
+    def setUp(self):
+        self.bs = 2
+        self.num_rois = 4
+        self.channel = 16
+        self.height = 32
+        self.width = 32
+        self.precision = AnalysisConfig.Precision.Float32
+        self.serialize = False
+        self.enable_trt = True
+
+    def build(self):
+        self.trt_parameters = TRTRoiAlignTest.TensorRTParam(
+            1 << 30, self.bs * self.num_rois, 1, self.precision, self.serialize,
+            False)
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data_shape = [-1, self.channel, self.height, self.width]
+            data = fluid.data(name='data', shape=data_shape, dtype='float32')
+            rois = fluid.data(
+                name='rois', shape=[-1, 4], dtype='float32', lod_level=1)
+            roi_align_out = fluid.layers.roi_align(data, rois)
+            out = fluid.layers.batch_norm(roi_align_out, is_test=True)
+
+        rois_lod = fluid.create_lod_tensor(
+            np.random.random([self.bs * self.num_rois, 4]).astype('float32'),
+            [[self.num_rois, self.num_rois]], fluid.CPUPlace())
+
+        data_shape[0] = self.bs
+        self.feeds = {
+            'data': np.random.random(data_shape).astype('float32'),
+            'rois': rois_lod,
+        }
+        self.fetch_list = [out]
+
+    def check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            atol = 1e-5
+            if self.trt_parameters.precision == AnalysisConfig.Precision.Half:
+                atol = 1e-3
+            self.check_output_with_option(use_gpu, atol, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+    def set_dynamic(self):
+        min_shape_spec = dict()
+        max_shape_spec = dict()
+        opt_shape_spec = dict()
+        min_shape_spec['data'] = [
+            self.bs, self.channel, self.height // 2, self.width // 2
+        ]
+        min_shape_spec['rois'] = [1, 4]
+        max_shape_spec[
+            'data'] = [self.bs, self.channel, self.height * 2, self.width * 2]
+        max_shape_spec['rois'] = [self.bs * self.num_rois, 4]
+        opt_shape_spec[
+            'data'] = [self.bs, self.channel, self.height, self.width]
+        opt_shape_spec['rois'] = [self.bs * self.num_rois, 4]
+
+        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam(
+            min_shape_spec, max_shape_spec, opt_shape_spec, False)
+
+    def run_test(self):
+        self.build()
+        self.check_output()
+
+    def test_base(self):
+        self.run_test()
+
+    def test_fp16(self):
+        self.precision = AnalysisConfig.Precision.Half
+        self.run_test()
+
+    def test_serialize(self):
+        self.serialize = True
+        self.run_test()
+
+    def test_dynamic(self):
+        self.set_dynamic()
+        self.run_test()
+
+    def test_dynamic_fp16(self):
+        self.set_dynamic()
+        self.precision = AnalysisConfig.Precision.Half
+        self.run_test()
+
+    def test_dynamic_serialize(self):
+        self.set_dynamic()
+        self.serialize = True
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py
index 67a1253b2c..4530e04d8d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py
@@ -48,5 +48,33 @@ class TRTScaleTest(InferencePassTest):
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
+class TRTScaleShape2Test(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 512, 512], dtype="float32")
+            scale_out = self.append_scale(data)
+            out = fluid.layers.batch_norm(scale_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 512, 512]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTScaleShape2Test.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def append_scale(self, data):
+        return fluid.layers.scale(
+            x=data, scale=2.0, bias=-1.0, bias_after_scale=False)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py
new file mode 100644
index 0000000000..cff8091cd9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTYoloBoxTest(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+        with fluid.program_guard(self.main_program, self.startup_program):
+            image_shape = [self.bs, self.channel, self.height, self.width]
+            image = fluid.data(name='image', shape=image_shape, dtype='float32')
+            image_size = fluid.data(
+                name='image_size', shape=[self.bs, 2], dtype='int32')
+            boxes, scores = self.append_yolobox(image, image_size)
+            scores = fluid.layers.reshape(scores, (self.bs, -1))
+            out = fluid.layers.batch_norm(scores, is_test=True)
+
+        self.feeds = {
+            'image': np.random.random(image_shape).astype('float32'),
+            'image_size': np.random.randint(
+                32, 64, size=(self.bs, 2)).astype('int32'),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTYoloBoxTest.TensorRTParam(
+            1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out, boxes]
+
+    def set_params(self):
+        self.bs = 4
+        self.channel = 255
+        self.height = 64
+        self.width = 64
+        self.class_num = 80
+        self.anchors = [10, 13, 16, 30, 33, 23]
+        self.conf_thresh = .1
+        self.downsample_ratio = 32
+
+    def append_yolobox(self, image, image_size):
+        return fluid.layers.yolo_box(
+            x=image,
+            img_size=image_size,
+            class_num=self.class_num,
+            anchors=self.anchors,
+            conf_thresh=self.conf_thresh,
+            downsample_ratio=self.downsample_ratio)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py
new file mode 100644
index 0000000000..93dc45f265
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py
@@ -0,0 +1,153 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_fusion_lstm_op import fc, ACTIVATION, fusion_lstm
+
+
+class TestFusionLSTMINT8MKLDNNOp(OpTest):
+    def set_confs(self):
+        pass
+
+    def setUp(self):
+        self.op_type = "fusion_lstm"
+        self.lod = [[2, 3, 5, 4]]
+        self.IC = 3
+        self.OC = 5
+        self.is_reverse = False
+        self.has_initial_state = False
+        self.act_cell = 'tanh'
+        self.act_gate = 'sigmoid'
+        self.act_cand = 'tanh'
+        self.use_peepholes = False  # LSTM u8 doesn't support peepholes
+        self.use_mkldnn = True
+        self.force_fp32_output = False
+        self.error_margin = 1e-5
+        self.set_confs()
+
+        # RNN dimensions
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
+
+        # Input data
+        x_f32 = np.random.rand(T, self.IC).astype('float32') * 2 - 1
+        scale_data = 63.0
+        shift_data = 64.0
+        x_u8 = np.rint(x_f32 * scale_data + shift_data).astype(np.uint8)
+
+        # WeightX/WeightH data
+        wx = np.random.rand(self.IC, 4 * self.OC).astype('float32') * 2 - 1
+        wh = np.random.rand(self.OC, 4 * self.OC).astype('float32') * 2 - 1
+
+        # Calculating weight scales
+        # scales = 127 / max(abs(channel_wise(weightsX + weightsH)))
+        s8_max = 127.0
+
+        scale_weights = s8_max / np.max(
+            np.abs(np.concatenate(
+                [wx[:, :], wh[:, :]], axis=0)), axis=0)
+
+        scale_weights = scale_weights.astype('float')
+
+        if self.use_peepholes:
+            b = np.random.rand(1, 7 * self.OC).astype('float32')
+        else:
+            b = np.random.rand(1, 4 * self.OC).astype('float32')
+        w_b = np.copy(b[:, 0:4 * self.OC])
+        w_c = b[:, 4 * self.OC:] if self.use_peepholes else None
+
+        bx = np.random.normal(size=(1, 4 * self.OC)).astype('float32')
+        b[0, 0:4 * self.OC] += bx[0, :]
+
+        if self.has_initial_state:
+            h0 = np.random.rand(N, self.OC).astype('float32')
+            c0 = np.random.rand(N, self.OC).astype('float32')
+        else:
+            h0 = np.zeros((N, self.OC)).astype('float32')
+            c0 = np.zeros((N, self.OC)).astype('float32')
+
+        hidden_f32, c = fusion_lstm(
+            x_f32, self.lod, wx, bx, h0, c0, wh, w_b, w_c, self.is_reverse,
+            ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
+            ACTIVATION[self.act_cand])
+
+        self.inputs = {
+            'X': (x_u8, self.lod),
+            'WeightX': wx,
+            'WeightH': wh,
+            'Bias': b
+        }
+
+        if self.has_initial_state:
+            self.inputs['H0'] = h0
+            self.inputs['C0'] = c0
+
+        if self.force_fp32_output:
+            self.error_margin = 1e-1
+            self.outputs = {
+                'Hidden': (hidden_f32, self.lod),
+                'Cell': (c, self.lod)
+            }
+        else:
+            self.error_margin = 2
+            hidden_u8 = np.rint(hidden_f32 * scale_data + shift_data).astype(
+                np.uint8)
+            self.outputs = {
+                'Hidden': (hidden_u8, self.lod),
+                'Cell': (c, self.lod)
+            }
+
+        self.attrs = {
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand,
+            'is_reverse': self.is_reverse,
+            'use_peepholes': self.use_peepholes,
+            'use_mkldnn': self.use_mkldnn,
+            'force_fp32_output': self.force_fp32_output,
+            'Scale_data': scale_data,
+            'Shift_data': shift_data,
+            'Scale_weights': scale_weights
+        }
+
+    def test_check_output(self):
+        for use_seq in {True, False}:
+            self.attrs['use_seq'] = use_seq
+            self.check_output(
+                check_dygraph=False,
+                no_check_set=["Cell"],
+                atol=self.error_margin)
+
+
+class TestFusionLSTMINT8MKLDNNOp2(TestFusionLSTMINT8MKLDNNOp):
+    def set_confs(self):
+        self.force_fp32_output = True
+
+
+class TestFusionLSTMINT8MKLDNNOp4(TestFusionLSTMINT8MKLDNNOp):
+    def set_confs(self):
+        self.is_reverse = True
+
+
+class TestFusionLSTMINT8MKLDNNOp5(TestFusionLSTMINT8MKLDNNOp):
+    def set_confs(self):
+        self.has_initial_state = True
+
+
+if __name__ == "__main__":
+    from paddle import enable_static
+    enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
index f433af2481..2e8b0b7e6b 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
@@ -66,12 +66,21 @@ def cnn_model(data):
     param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
     scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
 
-    predict = fluid.layers.fc(
-        input=conv_pool_2,
-        size=SIZE,
-        act="softmax",
-        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01)))
+    with fluid.device_guard("gpu:1"):
+        predict = fluid.layers.fc(
+            input=conv_pool_2,
+            size=SIZE,
+            act="softmax",
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01)))
+        # To cover @RENAMED@GRADIENT
+        predict2 = fluid.layers.fc(
+            input=conv_pool_1,
+            size=SIZE,
+            act="softmax",
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01)))
+        predict += predict2
     return predict
 
 
@@ -108,7 +117,14 @@ class TestDistMnist2x2(TestDistRunnerBase):
         bd = [steps_per_pass * p for p in passes]
         lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
         lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+<<<<<<< HEAD
         opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)
+=======
+        opt = fluid.optimizer.Momentum(
+            learning_rate=lr_val,
+            momentum=0.9,
+            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
         acc_steps = 2  # accumulated steps for pipeline
         if dist_strategy:
@@ -120,6 +136,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
             fleet.init(is_collective=True)
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
+<<<<<<< HEAD
+=======
+            strategy.amp = True
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
             strategy.pipeline_configs = {
                 'micro_batch_size': batch_size,
                 'schedule_mode': '1F1B',
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index bcf80fa477..ea183e9444 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -1971,9 +1971,9 @@ class TestPow_factor_tensor(TestActivation):
             feed={"x": input},
             fetch_list=[out_1, out_2, res, out_6])
 
-        assert np.array_equal(res_1, np.power(input, 2))
-        assert np.array_equal(res_2, np.power(input, 3))
-        assert np.array_equal(res_6, np.power(input, 3))
+        assert np.allclose(res_1, np.power(input, 2))
+        assert np.allclose(res_2, np.power(input, 3))
+        assert np.allclose(res_6, np.power(input, 3))
 
     def test_error(self):
         in1 = fluid.layers.data(
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index 29c35d28d4..83bba0b0ca 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -1465,5 +1465,41 @@ class TestConv2DAPI_Error(unittest.TestCase):
         self.assertRaises(ValueError, run_7)
 
 
+# --------- test environment variable ------
+@unittest.skipIf(
+    not (core.is_compiled_with_cuda() or core.is_compiled_with_rocm()),
+    "core is not compiled with CUDA or ROCM")
+class TestConv2DEnviron(unittest.TestCase):
+    def run_conv2d_api(self):
+        inputs = fluid.layers.data(
+            shape=[2, 3, 5, 5],
+            append_batch_size=False,
+            name="inputs",
+            dtype="float32")
+        fluid.layers.conv2d(
+            input=inputs,
+            num_filters=4,
+            filter_size=[3, 3],
+            stride=[1, 1],
+            padding=0,
+            dilation=[1, 1],
+            groups=1,
+            data_format="NCHW")
+
+        x_var = paddle.uniform((2, 3, 5, 5), dtype="float32", min=-1., max=1.)
+        conv = paddle.nn.Conv2D(
+            in_channels=3,
+            out_channels=4,
+            kernel_size=(3, 3),
+            data_format="NCHW")
+        y_var = conv(x_var)
+
+    def test_environ(self):
+        fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False})
+        self.run_conv2d_api()
+        fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True})
+        self.run_conv2d_api()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
index 81e2160a55..1a5e4b2835 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
@@ -18,6 +18,8 @@ import paddle
 import paddle.fluid as fluid
 import numpy as np
 import unittest
+from test_softmax_op import stable_softmax
+from test_softmax_with_cross_entropy_op import cross_entropy
 
 
 def stable_softmax(x):
@@ -42,6 +44,8 @@ def cross_entropy_loss_1d(input,
     C = input_shape[1]
     out = np.zeros_like(label).astype(np.float64)
     total_weight = 0
+    ###1. compute softmax cross_entropy (with weight)
+    ###   Note: only support hard labels.
     for i in range(N):
         cur_target = label[i]
         if cur_target == ignore_index:
@@ -50,6 +54,8 @@ def cross_entropy_loss_1d(input,
         cur_weight = weight[cur_target] if weight is not None else 1
         total_weight += cur_weight
         out[i] = -log_softmax_out[i][cur_target] * cur_weight
+
+    ###2. deal with reduction 
     if reduction == 'sum':
         return np.sum(out), np.array([total_weight]).astype('float64')
     elif reduction == 'mean':
@@ -92,7 +98,620 @@ def cross_entropy_loss_2d(input,
         return out
 
 
+def cross_entropy_soft(softmax,
+                       label,
+                       axis,
+                       N,
+                       weight=None,
+                       reduction='mean',
+                       ignore_index=-100):
+    #1.loss
+    loss = cross_entropy(
+        softmax,
+        label,
+        True,  #soft_label,
+        axis,
+        ignore_index)
+
+    if weight is None and reduction == 'none':
+        return loss
+
+    #2.weight
+    weighted_loss = loss
+    total_weight = N  #for weight is None
+    if weight is not None:
+        weighted_loss = np.zeros_like(loss).astype(np.float64)
+        total_weight = 0
+        for i in range(N):
+            cur_soft_label = label[i]
+            cur_weight = np.dot(weight, cur_soft_label)
+            total_weight += cur_weight
+            weighted_loss[i] = loss[i] * cur_weight
+
+    #3.reduce
+    if reduction == 'none':
+        return weighted_loss
+
+    elif reduction == 'mean':
+        weighted_loss_sum = np.sum(weighted_loss)
+        weighted_loss_mean = weighted_loss_sum / total_weight
+        return weighted_loss_mean
+
+    else:
+        weighted_loss_sum = np.sum(weighted_loss)
+        return weighted_loss_sum
+
+
+def cross_entropy_soft_2d(softmax,
+                          label,
+                          axis,
+                          N,
+                          H,
+                          W,
+                          weight=None,
+                          reduction='mean',
+                          ignore_index=-100):
+    #1.loss
+    loss = cross_entropy(
+        softmax,
+        label,
+        True,  #soft_label,
+        axis,
+        ignore_index)
+
+    if weight is None and reduction == 'none':
+        return loss
+
+    #2.weight
+    weighted_loss = loss
+    total_weight = N  #for weight is None
+    if weight is not None:
+        weighted_loss = np.zeros_like(loss).astype(np.float64)
+        total_weight = 0
+        for i in range(N):
+            for h in range(H):
+                for w in range(W):
+                    cur_soft_label = label[i][h][w]
+                    cur_weight = np.dot(weight, cur_soft_label)
+                    total_weight += cur_weight
+                    weighted_loss[i][h][w] = loss[i][h][w] * cur_weight
+
+    #3.reduce
+    if reduction == 'none':
+        return weighted_loss
+
+    elif reduction == 'mean':
+        weighted_loss_sum = np.sum(weighted_loss)
+        weighted_loss_mean = weighted_loss_sum / total_weight
+        return weighted_loss_mean
+
+    else:
+        weighted_loss_sum = np.sum(weighted_loss)
+        return weighted_loss_sum
+
+
 class CrossEntropyLoss(unittest.TestCase):
+
+    ###test for deprecated softmax_with_cross_entropy
+    def test_softmax_with_cross_entropy(self):
+        self.numeric_stable_mode = False
+        self.soft_label = True
+        self.dtype = np.float64
+        self.axis = -1
+        self.ignore_index = -100  #should not be changed
+        self.N = 4
+        self.C = 3
+        self.shape = [self.N, self.C]
+        self.use_softmax = True
+        self.reduction = 'none'
+        self.weight = None
+        self.logits = getattr(
+            self, "logits",
+            np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
+        softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits)
+
+        self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
+        self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
+
+        expected = cross_entropy_soft(
+            softmax,
+            self.labels,
+            self.axis,
+            self.N,
+            weight=self.weight,
+            reduction=self.reduction,
+            ignore_index=self.ignore_index)
+
+        paddle.set_device("cpu")
+
+        paddle.disable_static()
+        paddle_loss_swce = paddle.nn.functional.softmax_with_cross_entropy(
+            fluid.dygraph.to_variable(self.logits),
+            fluid.dygraph.to_variable(self.labels),
+            soft_label=True,
+            axis=self.axis)
+
+        paddle_loss_ce = paddle.nn.functional.cross_entropy(
+            fluid.dygraph.to_variable(self.logits),
+            fluid.dygraph.to_variable(self.labels),
+            soft_label=True,
+            axis=self.axis,
+            weight=fluid.dygraph.to_variable(self.weight)
+            if self.weight is not None else None,
+            reduction=self.reduction)
+
+        self.assertTrue(np.allclose(paddle_loss_swce.numpy(), expected))
+        self.assertTrue(np.allclose(paddle_loss_ce.numpy(), expected))
+
+    ###soft_label test start
+    ###soft_label test 1
+    def test_cross_entropy_loss_soft_1d(self):
+        self.numeric_stable_mode = False
+        self.soft_label = True
+        self.dtype = np.float64
+        self.axis = -1
+        self.ignore_index = -100  #should not be changed
+        self.N = 4
+        self.C = 3
+        self.shape = [self.N, self.C]
+        self.use_softmax = True
+        self.reduction = 'none'
+        self.weight = None
+        self.logits = getattr(
+            self, "logits",
+            np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
+        softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits)
+
+        self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
+        self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
+
+        expected = cross_entropy_soft(
+            softmax,
+            self.labels,
+            self.axis,
+            self.N,
+            weight=self.weight,
+            reduction=self.reduction,
+            ignore_index=self.ignore_index)
+
+        paddle.set_device("cpu")
+
+        #2. dygraph
+        paddle.disable_static()
+        paddle_loss_none_weight = paddle.nn.functional.cross_entropy(
+            fluid.dygraph.to_variable(self.logits),
+            fluid.dygraph.to_variable(self.labels),
+            soft_label=True,
+            axis=self.axis,
+            weight=fluid.dygraph.to_variable(self.weight)
+            if self.weight is not None else None,
+            reduction=self.reduction)
+        dy_ret_value = paddle_loss_none_weight.numpy()
+
+        #3. static
+        paddle.enable_static()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[self.N, self.C], dtype='float64')
+            label = fluid.data(
+                name='label', shape=[self.N, self.C], dtype='float64')
+
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                reduction=self.reduction, soft_label=True)
+            ret = cross_entropy_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': self.logits,
+                                     'label': self.labels,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        paddle.disable_static()
+
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    ###soft_label test 2
+    def test_cross_entropy_loss_soft_1d_weight(self):
+        self.numeric_stable_mode = False
+        self.soft_label = True
+        self.dtype = np.float64
+        self.axis = -1
+        self.ignore_index = -100  #should not be changed
+        self.N = 4
+        self.C = 3
+        self.shape = [self.N, self.C]
+        self.use_softmax = True
+        self.reduction = 'none'
+        self.weight = np.random.uniform(0.1, 1.0, self.C).astype(self.dtype)
+        self.logits = getattr(
+            self, "logits",
+            np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
+        softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits)
+
+        if self.soft_label:
+            self.labels = np.random.uniform(0.1, 1.0,
+                                            self.shape).astype(self.dtype)
+            self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
+        else:
+            axis_dim = self.shape[self.axis]
+            self.shape[self.axis] = 1
+            self.labels = np.random.randint(
+                0, axis_dim, self.shape, dtype="int64")
+
+        #1. numpy
+        expected = cross_entropy_soft(
+            softmax,
+            self.labels,
+            self.axis,
+            self.N,
+            weight=self.weight,
+            reduction=self.reduction,
+            ignore_index=self.ignore_index)
+
+        paddle.set_device("cpu")
+
+        #2. dygraph
+        paddle.disable_static()
+        paddle_loss_none_weight = paddle.nn.functional.cross_entropy(
+            fluid.dygraph.to_variable(self.logits),
+            fluid.dygraph.to_variable(self.labels),
+            soft_label=True,
+            axis=self.axis,
+            weight=fluid.dygraph.to_variable(self.weight),
+            reduction=self.reduction)
+        dy_ret_value = paddle_loss_none_weight.numpy()
+
+        # 3.static
+        paddle.enable_static()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[self.N, self.C], dtype='float64')
+            label = fluid.data(
+                name='label', shape=[self.N, self.C], dtype='float64')
+            weight = fluid.data(name='weight', shape=[self.C], dtype='float64')
+
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                weight=weight, reduction=self.reduction, soft_label=True)
+            ret = cross_entropy_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': self.logits,
+                                     'label': self.labels,
+                                     "weight": self.weight
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        paddle.disable_static()
+
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    ###soft_label test 3
+    def test_cross_entropy_loss_soft_1d_mean(self):
+        self.numeric_stable_mode = False
+        self.soft_label = True
+        self.dtype = np.float64
+        self.axis = -1
+        self.ignore_index = -100  #should not be changed
+        self.N = 4
+        self.C = 3
+        self.shape = [self.N, self.C]
+        self.use_softmax = True
+        self.reduction = 'mean'
+        self.weight = None
+        self.logits = getattr(
+            self, "logits",
+            np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
+        softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits)
+
+        self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
+        self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
+
+        #1. numpy
+        expected = cross_entropy_soft(
+            softmax,
+            self.labels,
+            self.axis,
+            self.N,
+            weight=self.weight,
+            reduction=self.reduction,
+            ignore_index=self.ignore_index)
+
+        paddle.set_device("cpu")
+
+        #2 dygraph 
+        paddle.disable_static()
+        paddle_loss_mean = paddle.nn.functional.cross_entropy(
+            fluid.dygraph.to_variable(self.logits),
+            fluid.dygraph.to_variable(self.labels),
+            soft_label=True,
+            axis=self.axis,
+            weight=self.weight,
+            reduction=self.reduction)
+        dy_ret_value = paddle_loss_mean.numpy()
+
+        #3. static
+        paddle.enable_static()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[self.N, self.C], dtype='float64')
+            label = fluid.data(
+                name='label', shape=[self.N, self.C], dtype='float64')
+
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                reduction=self.reduction, soft_label=True)
+            ret = cross_entropy_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(
+                prog,
+                feed={'input': self.logits,
+                      'label': self.labels},
+                fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        paddle.disable_static()
+
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    ###soft_label test 4
+    def test_cross_entropy_loss_soft_1d_weight_mean(self):
+        self.numeric_stable_mode = False
+        self.soft_label = True
+        self.dtype = np.float64
+        self.axis = -1
+        self.ignore_index = -100  #should not be changed
+        self.N = 4
+        self.C = 3
+        self.shape = [self.N, self.C]
+        self.use_softmax = True
+        self.reduction = 'mean'
+        self.weight = np.random.uniform(0.1, 1.0, self.C).astype(self.dtype)
+        self.logits = getattr(
+            self, "logits",
+            np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
+        softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits)
+
+        self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
+        self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
+
+        #1. numpy
+        expected = cross_entropy_soft(
+            softmax,
+            self.labels,
+            self.axis,
+            self.N,
+            weight=self.weight,
+            reduction=self.reduction,
+            ignore_index=self.ignore_index)
+
+        paddle.set_device("cpu")
+        paddle.disable_static()
+
+        #2. dygraph
+        paddle_loss_none_weight = paddle.nn.functional.cross_entropy(
+            fluid.dygraph.to_variable(self.logits),
+            fluid.dygraph.to_variable(self.labels),
+            soft_label=True,
+            axis=self.axis,
+            weight=fluid.dygraph.to_variable(self.weight),
+            reduction=self.reduction)
+        dy_ret_value = paddle_loss_none_weight.numpy()
+
+        #3. static
+        paddle.enable_static()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[self.N, self.C], dtype='float64')
+            label = fluid.data(
+                name='label', shape=[self.N, self.C], dtype='float64')
+            weight = fluid.data(name='weight', shape=[self.C], dtype='float64')
+
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                weight=weight, reduction=self.reduction, soft_label=True)
+            ret = cross_entropy_loss(input, label)
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': self.logits,
+                                     'label': self.labels,
+                                     "weight": self.weight
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        paddle.disable_static()
+
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    ###soft_label test 5
+    def test_cross_entropy_loss_soft_2d(self):
+        self.numeric_stable_mode = False
+        self.soft_label = True
+        self.dtype = np.float64
+        self.axis = -1
+        self.ignore_index = -100  #should not be changed
+        self.N = 3
+        self.H = 2
+        self.W = 2
+        self.C = 5
+        self.shape = [self.N, self.H, self.W, self.C]
+        self.use_softmax = True
+        self.reduction = 'none'
+        self.weight = None
+        self.logits = getattr(
+            self, "logits",
+            np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
+        softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits)
+
+        self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
+        self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
+
+        #1. numpy
+        expected = cross_entropy_soft_2d(
+            softmax,
+            self.labels,
+            self.axis,
+            self.N,
+            self.H,
+            self.W,
+            weight=self.weight,
+            reduction=self.reduction,
+            ignore_index=self.ignore_index)
+
+        paddle.set_device("cpu")
+        paddle.disable_static()
+
+        #2. dygraph
+        paddle_loss_none_weight = paddle.nn.functional.cross_entropy(
+            fluid.dygraph.to_variable(self.logits),
+            fluid.dygraph.to_variable(self.labels),
+            soft_label=True,
+            axis=self.axis,
+            weight=fluid.dygraph.to_variable(self.weight)
+            if self.weight is not None else None,
+            reduction=self.reduction)
+        dy_ret_value = paddle_loss_none_weight.numpy()
+
+        #3. static
+        paddle.enable_static()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input',
+                shape=[self.N, self.H, self.W, self.C],
+                dtype='float64')
+            label = fluid.data(
+                name='label',
+                shape=[self.N, self.H, self.W, self.C],
+                dtype='float64')
+
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                reduction=self.reduction, soft_label=True)
+            ret = cross_entropy_loss(input, label)
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': self.logits,
+                                     'label': self.labels,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        paddle.disable_static()
+
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    ###soft_label test 6
+    def test_cross_entropy_loss_soft_2d_weight_mean(self):
+        self.numeric_stable_mode = False
+        self.soft_label = True
+        self.dtype = np.float64
+        self.axis = -1
+        self.ignore_index = -100  #should not be changed
+        self.N = 3
+        self.H = 2
+        self.W = 2
+        self.C = 5
+        self.shape = [self.N, self.H, self.W, self.C]
+        self.use_softmax = True
+        self.reduction = 'mean'
+        self.weight = np.random.uniform(0.1, 1.0, self.C).astype(self.dtype)
+        self.logits = getattr(
+            self, "logits",
+            np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
+        softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits)
+
+        self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
+        self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
+
+        #1. numpy
+        expected = cross_entropy_soft_2d(
+            softmax,
+            self.labels,
+            self.axis,
+            self.N,
+            self.H,
+            self.W,
+            weight=self.weight,
+            reduction=self.reduction,
+            ignore_index=self.ignore_index)
+
+        paddle.set_device("cpu")
+        paddle.disable_static()
+
+        #2. dygraph
+        paddle_loss_none_weight = paddle.nn.functional.cross_entropy(
+            fluid.dygraph.to_variable(self.logits),
+            fluid.dygraph.to_variable(self.labels),
+            soft_label=True,
+            axis=self.axis,
+            weight=fluid.dygraph.to_variable(self.weight),
+            reduction=self.reduction)
+        dy_ret_value = paddle_loss_none_weight.numpy()
+
+        #3. static
+        paddle.enable_static()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input',
+                shape=[self.N, self.H, self.W, self.C],
+                dtype='float64')
+            label = fluid.data(
+                name='label',
+                shape=[self.N, self.H, self.W, self.C],
+                dtype='float64')
+            weight = fluid.data(name='weight', shape=[self.C], dtype='float64')
+
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                weight=weight, reduction=self.reduction, soft_label=True)
+            ret = cross_entropy_loss(input, label)
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': self.logits,
+                                     'label': self.labels,
+                                     "weight": self.weight
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        paddle.disable_static()
+
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    ###soft_label test end
+
     def test_cross_entropy_loss_1d_with_mean_ignore(self):
         input_np = np.random.random([2, 4]).astype(np.float64)
         label_np = np.random.randint(0, 4, size=(2)).astype(np.int64)
@@ -131,19 +750,21 @@ class CrossEntropyLoss(unittest.TestCase):
         self.assertTrue(np.allclose(dy_ret_value, expected))
 
     def test_cross_entropy_loss_1d_with_weight_mean_ignore(self):
-        input_np = np.random.random([2, 4]).astype(np.float64)
-        label_np = np.random.randint(0, 4, size=(2)).astype(np.int64)
-        weight_np = np.random.random([4]).astype(np.float64)  #shape:C
+        N = 100
+        C = 200
+        input_np = np.random.random([N, C]).astype(np.float64)
+        label_np = np.random.randint(0, C, size=(N)).astype(np.int64)
+        weight_np = np.random.random([C]).astype(np.float64)
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
         place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(name='input', shape=[2, 4], dtype='float64')
-            label = fluid.data(name='label', shape=[2], dtype='int64')
+            input = fluid.data(name='input', shape=[N, C], dtype='float64')
+            label = fluid.data(name='label', shape=[N], dtype='int64')
             weight = fluid.data(
-                name='weight', shape=[4],
+                name='weight', shape=[C],
                 dtype='float64')  #weight for each class
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=weight, ignore_index=0)
@@ -158,8 +779,6 @@ class CrossEntropyLoss(unittest.TestCase):
                                  },
                                  fetch_list=[ret])
             self.assertIsNotNone(static_ret)
-        expected = cross_entropy_loss_1d(
-            input_np, label_np, weight=weight_np)[0]
 
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
@@ -173,6 +792,7 @@ class CrossEntropyLoss(unittest.TestCase):
             self.assertIsNotNone(dy_ret_value)
         expected = cross_entropy_loss_1d(
             input_np, label_np, weight=weight_np, ignore_index=0)[0]
+
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
@@ -265,6 +885,7 @@ class CrossEntropyLoss(unittest.TestCase):
         input_np = np.random.random([100, 200]).astype(np.float64)  #N,C
         label_np = np.random.randint(0, 100, size=(100)).astype(np.int64)  #N,1
         weight_np = np.random.random([200]).astype(np.float64)  #C
+
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
@@ -274,6 +895,7 @@ class CrossEntropyLoss(unittest.TestCase):
             input = fluid.data(name='input', shape=[100, 200], dtype='float64')
             label = fluid.data(name='label', shape=[100], dtype='int64')
             weight = fluid.data(name='weight', shape=[200], dtype='float64')
+
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=weight, reduction='none')
             ret = cross_entropy_loss(input, label)
diff --git a/python/paddle/fluid/tests/unittests/test_data_generator.py b/python/paddle/fluid/tests/unittests/test_data_generator.py
index 6381cb3640..69d8e01fd4 100644
--- a/python/paddle/fluid/tests/unittests/test_data_generator.py
+++ b/python/paddle/fluid/tests/unittests/test_data_generator.py
@@ -95,6 +95,32 @@ class MyMultiSlotDataGenerator_error_5(fleet.MultiSlotDataGenerator):
         return data_iter
 
 
+class MyMultiSlotStringDataGenerator_zip(fleet.MultiSlotStringDataGenerator):
+    def generate_sample(self, line):
+        def data_iter():
+            for i in range(40):
+                if i == 1:
+                    yield None
+                feature_name = ["words", "label"]
+                data = [["1", "2", "3", "4"], ["0"]]
+                yield zip(feature_name, data)
+
+        return data_iter
+
+
+class MyMultiSlotDataGenerator_zip(fleet.MultiSlotDataGenerator):
+    def generate_sample(self, line):
+        def data_iter():
+            for i in range(40):
+                if i == 1:
+                    yield None
+                feature_name = ["words", "label"]
+                data = [[1, 2, 3, 4], [0]]
+                yield zip(feature_name, data)
+
+        return data_iter
+
+
 class TestMultiSlotDataGenerator(unittest.TestCase):
     def test_MultiSlotDataGenerator_basic(self):
         my_ms_dg = MyMultiSlotDataGenerator()
@@ -149,5 +175,19 @@ class TestMultiSlotDataGenerator_error_5(unittest.TestCase):
             my_ms_dg.run_from_memory()
 
 
+class TestMultiSlotStringDataGeneratorZip(unittest.TestCase):
+    def test_MultiSlotStringDataGenerator_zip(self):
+        my_ms_dg = MyMultiSlotStringDataGenerator_zip()
+        my_ms_dg.set_batch(1)
+        my_ms_dg.run_from_memory()
+
+
+class TestMultiSlotDataGeneratorZip(unittest.TestCase):
+    def test_MultiSlotDataGenerator_zip(self):
+        my_ms_dg = MyMultiSlotDataGenerator_zip()
+        my_ms_dg.set_batch(1)
+        my_ms_dg.run_from_memory()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 01f0abe0f2..1d7bfc9f69 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -166,12 +166,14 @@ class TestMovingAverageAbsMaxScaleOp(OpTest):
         accum[0] = 1
         state = np.zeros(1).astype("float32")
         state[0] = 1
+        x = np.random.random((8, 16, 7, 7)).astype("float32")
         self.inputs = {
-            'X': np.random.random((8, 16, 7, 7)).astype("float32"),
+            'X': x,
             'InAccum': accum,
             'InState': state,
         }
 
+        out = x
         out_accum = np.zeros(1).astype("float32")
         out_state = np.zeros(1).astype("float32")
         out_scale = np.zeros(1).astype("float32")
@@ -180,6 +182,7 @@ class TestMovingAverageAbsMaxScaleOp(OpTest):
         out_state[0] = self.attrs['moving_rate'] * state[0] + 1
         out_scale = out_accum / out_state
         self.outputs = {
+            'Out': out,
             'OutAccum': out_accum,
             'OutState': out_state,
             'OutScale': out_scale,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py b/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py
index ef90dd0498..2e722b69c3 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py
@@ -84,6 +84,18 @@ class TestImperativeContainer(unittest.TestCase):
             self.assertListEqual(res8.shape, [5, 3**3])
             res8.backward()
 
+            model4 = MyLayer(layerlist[:3])
+            model4.layerlist[-1] = fluid.dygraph.Linear(4, 5)
+            res9 = model4(x)
+            self.assertListEqual(res9.shape, [5, 5])
+            del model4.layerlist[-1]
+            res10 = model4(x)
+            self.assertListEqual(res10.shape, [5, 4])
+            model4.layerlist.insert(-1, fluid.dygraph.Linear(2, 2))
+            res11 = model4(x)
+            self.assertListEqual(res11.shape, [5, 4])
+            res11.backward()
+
     def test_layer_list(self):
         self.layer_list(True)
         self.layer_list(False)
diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
index 8c6383cd6e..04a0d47e47 100644
--- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
@@ -537,6 +537,18 @@ class TestLRScheduler(unittest.TestCase):
                 self._test_dygraph(python_func, paddle_api, kwarg, place)
                 paddle.enable_static()
 
+    def test_linear_warmp(self):
+        natural_lr = paddle.optimizer.lr.NaturalExpDecay(
+            learning_rate=0.5, gamma=0.1)
+        natural_lr_warmup = paddle.optimizer.lr.LinearWarmup(
+            learning_rate=natural_lr, warmup_steps=10, start_lr=0.0, end_lr=0.1)
+        for idx in range(30):
+            if idx >= 10:
+                self.assertEqual(natural_lr_warmup.get_lr(),
+                                 natural_lr.get_lr())
+                natural_lr.step()
+            natural_lr_warmup.step()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py
index 2d5f098a7f..b936567d5b 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py
@@ -206,6 +206,42 @@ for dim_X in (1, 2, 3):
                 api_test(dim_X, dim_Y, transose_x, transose_y)
 
 
+# Test case more batch_size and N, M, K
+def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y,
+                               batch_size):
+    BATCH_SIZE = 2
+    M = 3
+    N = 4
+    K = 5
+    if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
+        K = 1
+    if dim_X == 1:
+        if transpose_X:
+            shape_X = [M]
+        else:
+            shape_X = [K]
+    if dim_Y == 1:
+        if transpose_Y:
+            shape_Y = [N]
+        else:
+            shape_Y = [K]
+    if dim_X >= 2:
+        if transpose_X:
+            shape_X = [K, M]
+        else:
+            shape_X = [M, K]
+    if dim_X == 3:
+        shape_X = [BATCH_SIZE] + shape_X
+    if dim_Y >= 2:
+        if transpose_Y:
+            shape_Y = [N, K]
+        else:
+            shape_Y = [K, N]
+    if dim_Y == 3:
+        shape_Y = [BATCH_SIZE] + shape_Y
+    return shape_X, shape_Y
+
+
 # Test case n-dim
 def generate_compatible_shapes(dim, transpose_X, transpose_Y):
     M = 2
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
index 761d318d7b..efcc0e4cfe 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
@@ -67,7 +67,7 @@ class TestMatMulV2Op(OpTest):
         self.trans_y = False
 
     def init_kernel_type(self):
-        self.dtype = "float64"
+        self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
 
     def setUp(self):
         self.init_kernel_type()
@@ -91,7 +91,10 @@ class TestMatMulV2Op(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out')
+        if core.is_compiled_with_rocm():
+            self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2)
+        else:
+            self.check_grad(['X', 'Y'], 'Out')
 
 
 class TestMatMuklOp2(TestMatMulV2Op):
diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py
index 23dac41f64..0885891cdb 100644
--- a/python/paddle/fluid/tests/unittests/test_set_value_op.py
+++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py
@@ -48,18 +48,37 @@ class TestSetValueBase(unittest.TestCase):
 
 
 class TestSetValueApi(TestSetValueBase):
-    def test_api(self):
+    def _run_static(self):
+        paddle.enable_static()
         with paddle.static.program_guard(self.program):
             x = paddle.ones(shape=self.shape, dtype=self.dtype)
             self._call_setitem(x)
 
         exe = paddle.static.Executor(paddle.CPUPlace())
         out = exe.run(self.program, fetch_list=[x])
+        paddle.disable_static()
+        return out
+
+    def _run_dynamic(self):
+        paddle.disable_static()
+        x = paddle.ones(shape=self.shape, dtype=self.dtype)
+        self._call_setitem(x)
+        out = x.numpy()
+        paddle.enable_static()
+        return out
+
+    def test_api(self):
+        static_out = self._run_static()
+        dynamic_out = self._run_dynamic()
         self._get_answer()
+
+        error_msg = "\nIn {} mode: \nExpected res = \n{}, \n\nbut received : \n{}"
+        self.assertTrue(
+            (self.data == static_out).all(),
+            msg=error_msg.format("static", self.data, static_out))
         self.assertTrue(
-            (self.data == out).all(),
-            msg="\nExpected res = \n{}, \n\nbut received : \n{}".format(
-                self.data, out))
+            (self.data == dynamic_out).all(),
+            msg=error_msg.format("dynamic", self.data, dynamic_out))
 
 
 # 1. Test different type of item: int, Python slice, Paddle Tensor
@@ -106,6 +125,23 @@ class TestSetValueItemSlice4(TestSetValueApi):
         self.data[0:, 1:2, :] = self.value
 
 
+class TestSetValueItemSliceInWhile(TestSetValueApi):
+    def _call_setitem(self, x):
+        def cond(i, x):
+            return i < 1
+
+        def body(i, x):
+            x[i] = self.value
+            i = i + 1
+            return i, x
+
+        i = paddle.zeros(shape=(1, ), dtype='int32')
+        i, x = paddle.fluid.layers.while_loop(cond, body, [i, x])
+
+    def _get_answer(self):
+        self.data[0] = self.value
+
+
 # 1.2.2 step > 1
 class TestSetValueItemSliceStep(TestSetValueApi):
     def set_shape(self):
@@ -671,6 +707,20 @@ class TestSetValueValueShape4(TestSetValueApi):
         self.data[0] = self.value
 
 
+class TestSetValueValueShape5(TestSetValueApi):
+    def set_value(self):
+        self.value = np.array([3, 3, 3]).astype(self.dtype)
+
+    def set_shape(self):
+        self.shape = [3, 4]
+
+    def _call_setitem(self, x):
+        x[:, 0] = paddle.assign(self.value)  # x is Paddle.Tensor
+
+    def _get_answer(self):
+        self.data[:, 0] = self.value
+
+
 # 4. Test error
 class TestError(TestSetValueBase):
     def _value_type_error(self):
@@ -717,6 +767,7 @@ class TestError(TestSetValueBase):
             exe.run(program)
 
     def test_error(self):
+        paddle.enable_static()
         with paddle.static.program_guard(self.program):
             self._value_type_error()
             self._dtype_error()
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index b0c9dda7a3..1fea193547 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -76,6 +76,11 @@ class TestVarBase(unittest.TestCase):
                     y = x.cuda(blocking=True)
                     self.assertEqual(y.place.__repr__(), "CUDAPlace(0)")
 
+                # support 'dtype' is core.VarType
+                x = paddle.rand((2, 2))
+                y = paddle.to_tensor([2, 2], dtype=x.dtype)
+                self.assertEqual(y.dtype, core.VarDesc.VarType.FP32)
+
                 # set_default_dtype take effect on complex
                 x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False)
                 self.assertTrue(np.array_equal(x.numpy(), [1 + 2j]))
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 6310a76d8d..53f3b3cf53 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -20,6 +20,7 @@ import numpy as np
 from op_test import OpTest
 from test_softmax_op import stable_softmax
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid import Program, program_guard
 import paddle
 import paddle.nn.functional as F
@@ -240,8 +241,18 @@ class TestWarpCTCOp(OpTest):
 
     def test_check_grad(self):
         self.outputs['WarpCTCGrad'] = self.gradient
-        self.check_grad(
-            ["Logits"], "Loss", max_relative_error=0.007, check_dygraph=False)
+        if core.is_compiled_with_rocm():
+            self.check_grad(
+                ["Logits"],
+                "Loss",
+                max_relative_error=0.009,
+                check_dygraph=False)
+        else:
+            self.check_grad(
+                ["Logits"],
+                "Loss",
+                max_relative_error=0.007,
+                check_dygraph=False)
 
 
 class TestWarpCTCOpCase1(TestWarpCTCOp):
@@ -335,8 +346,18 @@ class TestWarpCTCOpWithPadding(OpTest):
 
     def test_check_grad(self):
         self.outputs['WarpCTCGrad'] = self.gradient
-        self.check_grad(
-            ["Logits"], "Loss", max_relative_error=0.007, check_dygraph=False)
+        if core.is_compiled_with_rocm():
+            self.check_grad(
+                ["Logits"],
+                "Loss",
+                max_relative_error=0.009,
+                check_dygraph=False)
+        else:
+            self.check_grad(
+                ["Logits"],
+                "Loss",
+                max_relative_error=0.007,
+                check_dygraph=False)
 
 
 class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
index cb64cb90e8..f1ba8828f2 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
@@ -51,10 +51,10 @@ class TestCastOp2(op_test.OpTest):
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
-        self.outputs = {'Out': ipt.astype('float32')}
+        self.outputs = {'Out': ipt.astype('float16')}
         self.attrs = {
             'in_dtype': int(core.VarDesc.VarType.FP32),
-            'out_dtype': int(core.VarDesc.VarType.FP32)
+            'out_dtype': int(core.VarDesc.VarType.FP16)
         }
         self.op_type = 'cast'
 
@@ -68,10 +68,10 @@ class TestCastOp2(op_test.OpTest):
 class TestCastOp3(op_test.OpTest):
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
-        self.inputs = {'X': ipt.astype('float32')}
+        self.inputs = {'X': ipt.astype('float16')}
         self.outputs = {'Out': ipt.astype('float32')}
         self.attrs = {
-            'in_dtype': int(core.VarDesc.VarType.FP32),
+            'in_dtype': int(core.VarDesc.VarType.FP16),
             'out_dtype': int(core.VarDesc.VarType.FP32)
         }
         self.op_type = 'cast'
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
index fa0feb02f4..54dc46cd0e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
@@ -27,8 +27,12 @@ from paddle.fluid import Program, program_guard
 paddle.enable_static()
 
 
-def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
+def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y,
+                               batch_size):
     BATCH_SIZE = 2
+    if batch_size != None:
+        BATCH_SIZE = batch_size
+
     M = 3
     N = 4
     K = 5
@@ -58,6 +62,13 @@ def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
             shape_Y = [K, N]
     if dim_Y == 3:
         shape_Y = [BATCH_SIZE] + shape_Y
+
+    if dim_Y == 3 and dim_X == 2:
+        if transpose_X == False:
+            shape_X[1] = shape_X[1] * BATCH_SIZE
+        else:
+            shape_X[0] = shape_X[0] * BATCH_SIZE
+
     return shape_X, shape_Y
 
 
@@ -77,11 +88,19 @@ def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
     if transpose_Y:
         if Y.ndim == 1:
             Y = Y.reshape((1, Y.size))
+        elif Y.ndim == 2:
+            Y = Y.T
         else:
             dim = [i for i in range(len(Y.shape))]
             dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
             Y = np.transpose(Y, tuple(dim))
 
+    if X.ndim == 3 and Y.ndim == 2:
+        x_dims = X.shape
+        X = X.reshape((x_dims[0] * x_dims[1], x_dims[2]))
+    if Y.ndim == 3 and X.ndim == 2:
+        y_dims = Y.shape
+        Y = Y.reshape((y_dims[0] * y_dims[1], y_dims[2]))
     Out = np.matmul(X, Y)
     if not Out.shape:
         # We do not support 0-dimensional Tensors (scalars). So where
@@ -203,11 +222,11 @@ def test_negative_dims_program(obj):
 
 
 # Generate program api cases for all negative possibilities
-def api_test(dim_x, dim_y, trans_x, trans_y):
+def api_test(dim_x, dim_y, trans_x, trans_y, batch_size):
     test_name = ('TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
         dim_x, dim_y, trans_x, trans_y))
     shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
-                                                  trans_y)
+                                                  trans_y, batch_size)
     globals()[test_name] = type(test_name, (unittest.TestCase, ), {
         'shape_X': shape_x,
         'shape_Y': shape_y,
@@ -218,29 +237,35 @@ def api_test(dim_x, dim_y, trans_x, trans_y):
 
 
 # Generate operators cases for all possibilities
-def inject_test(dim_x, dim_y, trans_x, trans_y):
-    test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
-        dim_x, dim_y, trans_x, trans_y))
+def inject_test(dim_x, dim_y, trans_x, trans_y, batch_size):
+    test_name = (
+        'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}'.format(
+            dim_x, dim_y, trans_x, trans_y, batch))
     shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
-                                                  trans_y)
+                                                  trans_y, batch_size)
     globals()[test_name] = type(test_name, (Generator, XPUOpTest), {
         'shape_X': shape_x,
         'shape_Y': shape_y,
         'transpose_X': trans_x,
         'transpose_Y': trans_y,
+        'op_type': "matmul"
     })
 
 
-for dim_X in (1, 2, 3):
-    for dim_Y in (1, 2, 3):
-        transose_x = False
-        transose_y = False
-        if dim_X == 3 and dim_Y == 3:
-            inject_test(dim_X, dim_Y, transose_x, transose_y)
-            api_test(dim_X, dim_Y, transose_x, transose_y)
+xpu_support_dims_list = [[1, 1], [2, 2], [3, 3]]
+batch_size = [2, 4, 5, 10, 50, 100, 300]
+for dims in xpu_support_dims_list:
+    dim_X = dims[0]
+    dim_Y = dims[1]
+    for transose_x in (False, True):
+        for transose_y in (False, True):
+            for batch in batch_size:
+                inject_test(dim_X, dim_Y, transose_x, transose_y, batch)
+            # xpu not support all negative possibilities
+            # api_test(dim_X, dim_Y, False, False, 10)
 
 
-# Test case n-dim
+            # Test case n-dim
 def generate_compatible_shapes(dim, transpose_X, transpose_Y):
     M = 2
     N = 4
@@ -261,7 +286,7 @@ def generate_compatible_shapes(dim, transpose_X, transpose_Y):
     return shape_X, shape_Y
 
 
-# # Test case n-dim
+# Test case n-dim
 for dim in [4]:
     for transpose_X in [False, True]:
         for transpose_Y in [False, True]:
@@ -275,6 +300,7 @@ for dim in [4]:
                 'shape_Y': shape_Y,
                 'transpose_X': transpose_X,
                 'transpose_Y': transpose_Y,
+                'op_type': "matmul"
             })
 
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
index 531e9488d6..435026220c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
@@ -45,7 +45,6 @@ def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
             dim = [i for i in range(len(Y.shape))]
             dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
             Y = np.transpose(Y, tuple(dim))
-
     Out = np.matmul(X, Y)
     if not Out.shape:
         # We do not support 0-dimensional Tensors (scalars). So where
@@ -98,16 +97,16 @@ class TestMatMulV2Op(XPUOpTest):
         self.check_grad_with_place(place, ['X', 'Y'], 'Out')
 
 
-# class TestMatMuklOp2(TestMatMulV2Op):
-#     """
-#     case 2
-#     """
+class TestMatMuklOp2(TestMatMulV2Op):
+    """
+    case 2
+    """
 
-#     def config(self):
-#         self.x_shape = (100, )
-#         self.y_shape = (1, 3, 2, 100)
-#         self.trans_x = False
-#         self.trans_y = True
+    def config(self):
+        self.x_shape = (100)
+        self.y_shape = (100, 3)
+        self.trans_x = False
+        self.trans_y = False
 
 
 class TestMatMuklOp3(TestMatMulV2Op):
@@ -122,16 +121,16 @@ class TestMatMuklOp3(TestMatMulV2Op):
         self.trans_y = False
 
 
-# class TestMatMuklOp4(TestMatMulV2Op):
-#     """
-#     case 4
-#     """
+class TestMatMuklOp4(TestMatMulV2Op):
+    """
+    case 4
+    """
 
-#     def config(self):
-#         self.x_shape = (100, )
-#         self.y_shape = (1, 2, 100, 2)
-#         self.trans_x = False
-#         self.trans_y = False
+    def config(self):
+        self.x_shape = (1, 1, 100, 1)
+        self.y_shape = (1, 100)
+        self.trans_x = False
+        self.trans_y = False
 
 
 class TestMatMuklOp5(TestMatMulV2Op):
@@ -146,27 +145,28 @@ class TestMatMuklOp5(TestMatMulV2Op):
         self.trans_y = False
 
 
-# class TestMatMuklOp6(TestMatMulV2Op):
-#     """
-#     case 6
-#     """
+class TestMatMuklOp6(TestMatMulV2Op):
+    """
+    case 6
+    """
 
-#     def config(self):
-#         self.x_shape = (1, 2, 102, 1)
-#         self.y_shape = (102, )
-#         self.trans_x = True
-#         self.trans_y = False
+    def config(self):
+        self.x_shape = (1, 2, 102, 10)
+        self.y_shape = (2, 10, 111)
+        self.trans_x = False
+        self.trans_y = False
 
-# class TestMatMuklOp7(TestMatMulV2Op):
-#     """
-#     case 7
-#     """
 
-#     def config(self):
-#         self.x_shape = (1, 2, 1, 100)
-#         self.y_shape = (100, )
-#         self.trans_x = False
-#         self.trans_y = False
+class TestMatMuklOp7(TestMatMulV2Op):
+    """
+    case 7
+    """
+
+    def config(self):
+        self.x_shape = (1, 2, 100, 1)
+        self.y_shape = (2, 100, 12)
+        self.trans_x = True
+        self.trans_y = False
 
 
 class TestMatMuklOp8(TestMatMulV2Op):
@@ -181,49 +181,52 @@ class TestMatMuklOp8(TestMatMulV2Op):
         self.trans_y = False
 
 
-# class TestMatMuklOp9(TestMatMulV2Op):
-#     """
-#     case 9
-#     """
+class TestMatMuklOp9(TestMatMulV2Op):
+    """
+    case 9
+    """
 
-#     def config(self):
-#         self.x_shape = (1, 1, 1, 100)
-#         self.y_shape = (2, 1, 2, 100)
-#         self.trans_x = False
-#         self.trans_y = True
+    def config(self):
+        self.x_shape = (100, 20, 100)
+        self.y_shape = (100, 100, 100)
+        self.trans_x = False
+        self.trans_y = True
 
-# class TestMatMuklOp10(TestMatMulV2Op):
-#     """
-#     case 10
-#     """
 
-#     def config(self):
-#         self.x_shape = (1, 1, 25, 4)
-#         self.y_shape = (1, 2, 4, 25)
-#         self.trans_x = False
-#         self.trans_y = False
+class TestMatMuklOp10(TestMatMulV2Op):
+    """
+    case 10
+    """
 
-# class TestMatMuklOp11(TestMatMulV2Op):
-#     """
-#     case 11
-#     """
+    def config(self):
+        self.x_shape = (100, 20, 100)
+        self.y_shape = (100, 20, 100)
+        self.trans_x = True
+        self.trans_y = False
 
-#     def config(self):
-#         self.x_shape = (2, 1, 2, 100)
-#         self.y_shape = (1, 1, 100, 2)
-#         self.trans_x = False
-#         self.trans_y = False
 
-# class TestMatMuklOp12(TestMatMulV2Op):
-#     """
-#     case 12
-#     """
+class TestMatMuklOp11(TestMatMulV2Op):
+    """
+    case 11
+    """
 
-#     def config(self):
-#         self.x_shape = (2, 1, 4, 25)
-#         self.y_shape = (1, 1, 4, 25)
-#         self.trans_x = True
-#         self.trans_y = False
+    def config(self):
+        self.x_shape = (2, 20, 100)
+        self.y_shape = (100, 30)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMuklOp12(TestMatMulV2Op):
+    """
+    case 12
+    """
+
+    def config(self):
+        self.x_shape = (1, 20, 100)
+        self.y_shape = (100, )
+        self.trans_x = False
+        self.trans_y = False
 
 
 class TestMatMuklOp13(TestMatMulV2Op):
@@ -238,38 +241,40 @@ class TestMatMuklOp13(TestMatMulV2Op):
         self.trans_y = False
 
 
-# class TestMatMuklOp14(TestMatMulV2Op):
-#     """
-#     case 14_1
-#     """
+class TestMatMuklOp14(TestMatMulV2Op):
+    """
+    case 14_1
+    """
 
-#     def config(self):
-#         self.x_shape = (3, 1, 6, 6)
-#         self.y_shape = (1, 2, 6, 9)
-#         self.trans_x = True
-#         self.trans_y = False
+    def config(self):
+        self.x_shape = (100, 2, 100, 10)
+        self.y_shape = (100, 2, 10, 90)
+        self.trans_x = False
+        self.trans_y = False
 
-# class TestMatMuklOp15(TestMatMulV2Op):
-#     """
-#     case 14_2
-#     """
 
-#     def config(self):
-#         self.x_shape = (3, 1, 6, 6)
-#         self.y_shape = (1, 2, 6, 9)
-#         self.trans_x = False
-#         self.trans_y = False
+class TestMatMuklOp15(TestMatMulV2Op):
+    """
+    case 14_2
+    """
 
-# class TestMatMuklOp16(TestMatMulV2Op):
-#     """
-#     case 16 : to check the gradient for special case
-#     """
+    def config(self):
+        self.x_shape = (100, 2, 100, 10)
+        self.y_shape = (100, 2, 100, 10)
+        self.trans_x = False
+        self.trans_y = True
 
-#     def config(self):
-#         self.x_shape = (100)
-#         self.y_shape = (1, 2, 2, 100, 2)
-#         self.trans_x = False
-#         self.trans_y = False
+
+class TestMatMuklOp16(TestMatMulV2Op):
+    """
+    case 16 : to check the big data
+    """
+
+    def config(self):
+        self.x_shape = (1000, 2, 100, 100)
+        self.y_shape = (1000, 2, 100, 900)
+        self.trans_x = False
+        self.trans_y = False
 
 
 class TestMatMuklOp17(TestMatMulV2Op):
diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py
index c422bacdf7..662515f0e5 100644
--- a/python/paddle/incubate/__init__.py
+++ b/python/paddle/incubate/__init__.py
@@ -14,7 +14,6 @@
 
 from . import optimizer
 from ..fluid.contrib import reader
-from ..fluid import load_op_library
 from ..fluid.layer_helper import LayerHelper
 
 __all__ = []
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index c223addc26..1dad1632e2 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*
 #   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,7 +28,7 @@ from ...fluid.layers import dice_loss  #DEFINE_ALIAS
 from ...fluid.layers import log_loss  #DEFINE_ALIAS
 from ...fluid.layers import npair_loss  #DEFINE_ALIAS
 from ...fluid.layers import reshape
-from ...fluid.layers import softmax_with_cross_entropy  #DEFINE_ALIAS
+from ...fluid.layers import softmax_with_cross_entropy as fluid_softmax_with_cross_entropy  #DEFINE_ALIAS
 from ...fluid.layers import square_error_cost  #DEFINE_ALIAS
 
 from ...fluid.layers import edit_distance  #DEFINE_ALIAS
@@ -36,6 +37,7 @@ from ...fluid.layer_helper import LayerHelper
 from ...fluid.framework import in_dygraph_mode
 from ...fluid.framework import _varbase_creator
 from ...fluid.framework import Variable
+from paddle.utils import deprecated
 
 __all__ = [
     'binary_cross_entropy',
@@ -682,7 +684,6 @@ def l1_loss(input, label, reduction='mean', name=None):
 
             import paddle
 
-            paddle.disable_static()
             input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]])
             label = paddle.to_tensor([[1.7, 1], [0.4, 0.5]])
 
@@ -1112,6 +1113,19 @@ def ctc_loss(log_probs,
     return loss_out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.cross_entropy")
+def softmax_with_cross_entropy(logits,
+                               label,
+                               soft_label=False,
+                               ignore_index=-100,
+                               numeric_stable_mode=True,
+                               return_softmax=False,
+                               axis=-1):
+    return fluid_softmax_with_cross_entropy(logits, label, soft_label,
+                                            ignore_index, numeric_stable_mode,
+                                            return_softmax, axis)
+
+
 def cross_entropy(input,
                   label,
                   weight=None,
@@ -1119,87 +1133,248 @@ def cross_entropy(input,
                   reduction='mean',
                   soft_label=False,
                   axis=-1,
+                  use_softmax=True,
                   name=None):
     r"""
-    This operator implements the cross entropy loss function with softmax. This function 
+    By default, this operator implements the cross entropy loss function with softmax. This function 
     combines the calculation of the softmax operation and the cross entropy loss function 
-    to provide a more numerically stable gradient.
-    Because this operator performs a softmax on logits internally, it expects
-    unscaled logits. This operator should not be used with the output of
-    softmax operator since that would produce incorrect results.
+    to provide a more numerically stable computing. 
 
-    When the attribute :attr:`soft_label` is set :attr:`False`, this operators 
-    expects mutually exclusive hard labels, each sample in a batch is in exactly 
-    one class with a probability of 1.0. Each sample in the batch will have a 
-    single label.
+    This operator will calculate the cross entropy loss function without softmax when use_softmax=False.
 
-    The equation is as follows:
+    By default, this operator will calculate the mean of the result, and you can also affect 
+    the default behavior by using the reduction parameter. Please refer to the part of 
+    parameters for details.
 
-    1) Hard label (one-hot label, so every sample has exactly one class)
+    This operator can be used to calculate the softmax cross entropy loss with soft and hard labels.
+    Where, the hard labels mean the actual label value, 0, 1, 2, etc.  And the soft labels 
+    mean the probability of the actual label, 0.6, 0.8, 0.2, etc.
 
-    .. math::
+    The calculation of this operator includes the following two steps.
 
-        loss_j =  -\\text{logits}_{label_j} +
-        \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K
+    - **1.softmax cross entropy**
 
-    2) Soft label (each sample can have a distribution over all classes)
+        1. Hard label (each sample can only be assigned into one category)
 
-    .. math::
+        1.1. when use_softmax=True
 
-        loss_j =  -\\sum_{i=0}^{K}\\text{label}_i
-        \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K}
-        \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K
+            .. math::
+              \\loss_j=-\text{logits}_{label_j}+\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right) , j = 1,...,N
 
- 
-    It is useful when training a classification problem with ``C`` classes.
+            where, N is the number of samples and C is the number of categories.
+
+        1.2. when use_softmax=False
+
+            .. math::
+              \\loss_j=-\log\left({P}_{label_j}\right) , j = 1,...,N
+
+            where, N is the number of samples and C is the number of categories, P is input(the output of softmax).
+
+
+        2. Soft label (each sample is assigned to multiple categories with a certain probability, and the probability sum is 1).
+
+        2.1. when use_softmax=True
+
+            .. math::
+              \\loss_j=-\sum_{i=0}^{C}\text{label}_i\left(\text{logits}_i-\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right)\right) , j = 1,...,N
+
+            where, N is the number of samples and C is the number of categories.
+
+        2.2. when use_softmax=False
+
+            .. math::
+              \\loss_j=-\sum_{j=0}^{C}\left({label}_j*\log\left({P}_{label_j}\right)\right) , j = 1,...,N
+
+            where, N is the number of samples and C is the number of categories, P is input(the output of softmax).
+
+
+
+
+    - **2. Weight and reduction processing**
+
+        1. Weight
+
+            If the ``weight`` parameter is ``None`` , go to the next step directly.
+
+            If the ``weight`` parameter is not ``None`` , the cross entropy of each sample is weighted by weight
+            according to soft_label = False or True as follows.
+
+            1.1. Hard labels (soft_label = False)
+
+            .. math::
+                \\loss_j=loss_j*weight[label_j] 
 
 
+            1.2. Soft labels (soft_label = True)
+
+             .. math::
+                \\loss_j=loss_j*\sum_{i}\left(weight[label_i]*logits_i\right)
+
+        2. reduction
+
+            2.1 if the ``reduction`` parameter is ``none`` 
+
+                Return the previous result directly
+
+            2.2 if the ``reduction`` parameter is ``sum`` 
+
+                Return the sum of the previous results
+
+            .. math::
+               \\loss=\sum_{j}loss_j
+
+            2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to 
+            the ``weight`` parameter as follows. 
+
+            2.3.1. If the  ``weight``  parameter is ``None`` 
+
+                   Return the average value of the previous results
+
+             .. math::
+                \\loss=\sum_{j}loss_j/N
+
+                  where, N is the number of samples and C is the number of categories.
+
+            2.3.2. If the 'weight' parameter is not 'None', the weighted average value of the previous result will be returned
+
+            1. Hard labels (soft_label = False)
+
+             .. math::
+                \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 
+
+            2. Soft labels (soft_label = True)
+
+             .. math::
+                \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right)
+ 
+ 
     Parameters:
-        input (Tensor): Input tensor, the data type is float32, float64. Shape is
-	    (N, C), where C is number of classes, and if shape is more than 2D, this
-	    is (N, D1, D2,..., Dk, C), k >= 1.
-        label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
-	    value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
-	    (N, D1, D2,..., Dk), k >= 1.
-        weight (Tensor, optional):a manual rescaling weight given to each class. 
+
+        - **input** (Tensor)
+
+            Input tensor, the data type is float32, float64. Shape is
+	    :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` . 
+
+            Note: 
+
+                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the 
+                output of softmax operator, which will produce incorrect results.
+
+                2. when use_softmax=False, it expects the output of softmax operator.
+ 
+        - **label** (Tensor)
+
+            1. If soft_label=False, the shape is
+            :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
+            the data type is int32, int64, float32, float64, where each value is [0, C-1].
+
+            2. If soft_label=True, the shape and data type should be same with ``input`` , 
+            and the sum of the labels for each sample should be 1.
+
+        - **weight** (Tensor, optional)
+
+            a manual rescaling weight given to each class. 
             If given, has to be a Tensor of size C and the data type is float32, float64. 
-            Default is ``'None'``.
-        reduction (str, optional): Indicate how to average the loss by batch_size,
+            Default is ``'None'`` .
+
+        - **ignore_index** (int64, optional)
+
+            Specifies a target value that is ignored
+            and does not contribute to the loss. A negative value means that no label 
+            value needs to be ignored. Only valid when soft_label = False.  
+            Default is ``-100`` .
+
+        - **reduction** (str, optional)
+
+            Indicate how to average the loss by batch_size,
             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
             If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
             If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
             If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
             Default is ``'mean'``.
-        ignore_index (int64, optional): Specifies a target value that is ignored
-            and does not contribute to the input gradient. Default is ``-100``.
-        soft_label (bool): indicate whether label is soft. Default False, meaning that
-                the label is hard. If soft_label=True, the label is soft.
-        axis (int, optional): The index of dimension to perform softmax calculations. It 
-                              should be in range :math:`[-1, rank - 1]`, while :math:`rank`
-                              is the rank of input :attr:`logits`. Default: -1.
 
+        - **soft_label** (bool, optional)
+
+            Indicate whether label is soft. 
+            Default is ``False``.
+
+        - **axis** (int, optional)
+
+            The index of dimension to perform softmax calculations. 
+            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the 
+            number of dimensions of input :attr:`input`. 
+            Default is ``-1`` .
+
+        - **use_softmax** (bool, optional)
+
+            Indicate whether compute softmax before cross_entropy.
+            Default is ``True``.
+
+        - **name** (str，optional)
+
+            The name of the operator. Default is ``None`` .
+            For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        Tensor.The tensor storing the cross_entropy_loss of input and label.
 
+        Tensor. Return the softmax cross_entropy loss of ``input`` and ``label``.
+        The data type is the same as input.
 
-    Examples:
-        .. code-block:: python
+        If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
 
-            import paddle
-            import numpy as np
+        If :attr:`reduction` is ``'none'``:
 
-            input_data = np.random.random([5, 100]).astype("float64")
-            label_data = np.random.randint(0, 100, size=(5)).astype(np.int64)
-            weight_data = np.random.random([100]).astype("float64")
+        1. If soft_label = False, the dimension of return value is the same with ``label`` . 
 
-            input =  paddle.to_tensor(input_data)
-            label =  paddle.to_tensor(label_data)
-            weight = paddle.to_tensor(weight_data)
+        2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . 
+
+
+     Example1(hard labels):
+
+        .. code-block:: python
+            
+            import paddle
+            paddle.seed(99999)
+            N=100
+            C=200
+            reduction='mean'
+            input =  paddle.rand([N, C], dtype='float64')  
+            label =  paddle.randint(0, C, shape=[N], dtype='int64')
+            weight = paddle.rand([C], dtype='float64') 
+            
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                weight=weight, reduction=reduction)
+            dy_ret = cross_entropy_loss(
+                                       input,
+                                       label)
+            print(dy_ret.numpy()) #[5.41993642]
+
+
+    Example2(soft labels):
+
+        .. code-block:: python
+            
+            import paddle
+            paddle.seed(99999)
+            axis = -1
+            ignore_index = -100
+            N = 4
+            C = 3
+            shape = [N, C]
+            reduction='mean'
+            weight = None
+            logits = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
+            labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
+            labels /= paddle.sum(labels, axis=axis, keepdim=True)
+            paddle_loss_mean = paddle.nn.functional.cross_entropy(
+                                                                  logits,  
+                                                                  labels, 
+                                                                  soft_label=True, 
+                                                                  axis=axis,
+                                                                  weight=weight,
+                                                                  reduction=reduction)
+            print(paddle_loss_mean.numpy()) #[1.12908343]
 
-            loss = paddle.nn.functional.cross_entropy(input=input, label=label, weight=weight)
-            print(loss)
-            # [4.28546723]
     """
 
     if reduction not in ['sum', 'mean', 'none']:
@@ -1207,6 +1382,12 @@ def cross_entropy(input,
             "The value of 'reduction' in softmax_cross_entropy"
             "should be 'sum', 'mean' or 'none', but received %s, which is not allowed."
             % reduction)
+    if ignore_index > 0 and soft_label == True:
+        raise ValueError(
+            "When soft_label == True, the value of 'ignore_index' in softmax_cross_entropy"
+            "should be '-100', but received %s, which is not allowed." %
+            ignore_index)
+
     input_dims = len(list(input.shape))
     label_dims = len(list(label.shape))
     if input_dims - 1 != label_dims and input_dims != label_dims:
@@ -1216,27 +1397,46 @@ def cross_entropy(input,
     if input_dims - 1 == label_dims:
         label = paddle.unsqueeze(label, axis=axis)
     if in_dygraph_mode():
-        out = softmax_with_cross_entropy(
-            input,
-            label,
-            soft_label=soft_label,
-            ignore_index=ignore_index,
-            axis=axis)
+        _, out = core.ops.softmax_with_cross_entropy(
+            input, label, 'soft_label', soft_label, 'ignore_index',
+            ignore_index, 'numeric_stable_mode', True, 'axis', axis,
+            'use_softmax', use_softmax)
+
         if weight is not None:
-            weight_gather = core.ops.gather_nd(
-                weight, label)  #trans weight from class to sample, shape:N
-            input_shape = list(label.shape)
-            weight_gather_reshape = reshape(weight_gather, shape=input_shape)
-            out = core.ops.elementwise_mul(out, weight_gather_reshape)
+
+            #trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
+            if soft_label == True:
+                # chajchaj:
+                # weight's shape is C, where C is class num.
+                # for 1d case: label's shape is [N,C], weight_gather's shape is N.
+                # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
+                weight_gather = paddle.matmul(
+                    x=paddle.cast(label, weight.dtype),
+                    y=weight,
+                    transpose_x=False,
+                    transpose_y=True)
+                out_shape = list(out.shape)
+                weight_gather_reshape = reshape(weight_gather, shape=out_shape)
+                out = paddle.cast(out, weight_gather_reshape.dtype)
+
+                out = core.ops.elementwise_mul(out, weight_gather_reshape)
+
+            else:
+                weight_gather = core.ops.gather_nd(weight, label)
+                input_shape = list(label.shape)
+                weight_gather_reshape = reshape(
+                    weight_gather, shape=input_shape)
+                out = paddle.cast(out, weight_gather_reshape.dtype)
+                out = core.ops.elementwise_mul(out, weight_gather_reshape)
 
         if reduction == "sum":
-            #   because of softmax_with_cross_entropy op's inner logic, 
+            #   because of fluid_softmax_with_cross_entropy op's inner logic, 
             #   in the out tensor of this op, the loss of sample with class_index==ignore_index is 0
             #   so, reduce_sum all directly is ok
             return core.ops.reduce_sum(out, 'reduce_all', True)
         elif reduction == "mean":
             #1. if weight==none, 
-            #    numerator: reduce_sum all loss directly is ok causeof softmax_with_cross_entropy's inner logic
+            #    numerator: reduce_sum all loss directly is ok causeof fluid_softmax_with_cross_entropy's inner logic
             #    denominator: count sample num with class_index!=ignore_index
             #2. else
             #    numerator: loss's weighted sum 
@@ -1247,7 +1447,7 @@ def cross_entropy(input,
                 #mask[i]=0, if label[i]==ignore_index
                 #mask[i]=1, otherwise 
                 mask = (label != ignore_index)
-                if (weight is None):
+                if weight is None:
                     mask = paddle.cast(mask, dtype=out_sum.dtype)
                     count = core.ops.reduce_sum(mask, 'reduce_all', True)
                     ret = out_sum / count
@@ -1277,20 +1477,48 @@ def cross_entropy(input,
     fluid.data_feeder.check_variable_and_dtype(
         label, 'label', ['int32', 'int64', 'float32', 'float64'],
         'softmax_cross_entropy')
-    out = softmax_with_cross_entropy(
-        input,
-        label,
-        soft_label=soft_label,
-        ignore_index=ignore_index,
-        axis=axis)
+    attrs = {
+        'soft_label': soft_label,
+        'ignore_index': ignore_index,
+        'numeric_stable_mode': True,
+        'axis': axis,
+        'use_softmax': use_softmax
+    }
+    helper = LayerHelper('softmax_with_cross_entropy', **locals())
+    softmax = helper.create_variable_for_type_inference(dtype=input.dtype)
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='softmax_with_cross_entropy',
+        inputs={'Logits': input,
+                'Label': label},
+        outputs={'Softmax': softmax,
+                 'Loss': out},
+        attrs=attrs)
+
     if weight is not None:
         fluid.data_feeder.check_variable_and_dtype(
             weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy')
         weight_name = name if reduction == 'none' else None
-        weight_gather = paddle.gather_nd(
-            weight, label)  #trans weight from class to sample, shape:N
-        input_shape = list(label.shape)
-        weight_gather_reshape = reshape(weight_gather, shape=input_shape)
+        if soft_label == True:
+            # chajchaj:
+            #trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
+            # weight's shape is C, where C is class num.
+            # for 1d case: label's shape is [N,C], weight_gather's shape is N.
+            # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
+            weight_gather = paddle.matmul(
+                x=paddle.cast(label, weight.dtype),
+                y=weight,
+                transpose_x=False,
+                transpose_y=True)
+
+            out_shape = list(out.shape)
+            weight_gather_reshape = reshape(weight_gather, shape=out_shape)
+            out = paddle.cast(out, weight_gather_reshape.dtype)
+        else:
+            weight_gather = paddle.gather_nd(
+                weight, label)  #trans weight from class to sample, shape:N
+            input_shape = list(label.shape)
+            weight_gather_reshape = reshape(weight_gather, shape=input_shape)
         out = paddle.multiply(out, weight_gather_reshape, name=weight_name)
 
     if reduction == "sum":
diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py
index 03ba78e12f..54824233f7 100644
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -188,10 +188,10 @@ def batch_norm(x,
 
     if in_dygraph_mode():
         # for dygraph need tuple
-        attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout",
-                 data_format, "use_mkldnn", False, "fuse_with_relu", False,
-                 "use_global_stats", use_global_stats, "trainable_statistics",
-                 trainable_statistics)
+        attrs = ("momentum", momentum, "epsilon", epsilon, "is_test",
+                 not training, "data_layout", data_format, "use_mkldnn", False,
+                 "fuse_with_relu", False, "use_global_stats", use_global_stats,
+                 "trainable_statistics", trainable_statistics)
         batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
             x, weight, bias, running_mean, running_var, mean_out, variance_out,
             *attrs)
@@ -205,6 +205,7 @@ def batch_norm(x,
     attrs = {
         "momentum": momentum,
         "epsilon": epsilon,
+        "is_test": not training,
         "data_layout": data_format,
         "use_mkldnn": False,
         "fuse_with_relu": False,
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index d0f97625bc..60c846f9f7 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -578,7 +578,7 @@ class Bilinear(layers.Layer):
 
     .. math::
 
-      out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1
+      out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,outfeatures-1
 
       out = out + b
 
@@ -586,7 +586,7 @@ class Bilinear(layers.Layer):
      - :math:`x1`: the first input contains in1_features elements, shape is [batch_size, in1_features].
      - :math:`x2`: the second input contains in2_features elements, shape is [batch_size, in2_features].
      - :math:`W_{i}`: the i-th learned weight, shape is [in1_features, in2_features], and learned weight's shape is [out_features, in1_features, in2_features].
-     - :math:`out_{i}`: the i-th element of out, shape is [batch_size, out_features].
+     - :math:`out_{i}`: the i-th element of out, shape is [batch_size], and out's shape is [batch_size, out_features].
      - :math:`b`: the learned bias, shape is [1, out_features].
      - :math:`x2^\mathrm{T}`: the transpose of :math:`x2`.
 
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index 389920b923..d65b874c8b 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -25,6 +25,7 @@ __all__ = [
 
 import numpy as np
 
+from ...fluid import get_flags
 from ...fluid import core
 from ...device import get_cudnn_version
 from ...fluid.dygraph import layers
@@ -644,6 +645,10 @@ class Conv2D(_ConvNd):
             bias_attr=bias_attr,
             data_format=data_format)
 
+        if (core.is_compiled_with_cuda() and get_flags(
+                "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]):
+            self._use_cudnn = False
+
     def forward(self, x):
         if self._padding_mode != 'zeros':
             x = F.pad(x,
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index ac1cb5a818..ad046b9041 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*
 #   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -108,7 +109,6 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer):
 
         .. code-block:: python
             import paddle
-            paddle.disable_static()
             logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
             label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
             bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
@@ -142,85 +142,249 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer):
 
 class CrossEntropyLoss(fluid.dygraph.Layer):
     r"""
-    This operator implements the cross entropy loss function with softmax. This function 
+    By default, this operator implements the cross entropy loss function with softmax. This function 
     combines the calculation of the softmax operation and the cross entropy loss function 
-    to provide a more numerically stable gradient.
+    to provide a more numerically stable computing.
 
-    Because this operator performs a softmax on logits internally, it expects
-    unscaled logits. This operator should not be used with the output of
-    softmax operator since that would produce incorrect results.
+    This operator will calculate the cross entropy loss function without softmax when use_softmax=False.
 
-    When the attribute :attr:`soft_label` is set :attr:`False`, this operators 
-    expects mutually exclusive hard labels, each sample in a batch is in exactly 
-    one class with a probability of 1.0. Each sample in the batch will have a 
-    single label.
+    By default, this operator will calculate the mean of the result, and you can also affect 
+    the default behavior by using the reduction parameter. Please refer to the part of 
+    parameters for details.
 
-    The equation is as follows:
+    This operator can be used to calculate the softmax cross entropy loss with soft and hard labels.
+    Where, the hard labels mean the actual label value, 0, 1, 2, etc.  And the soft labels 
+    mean the probability of the actual label, 0.6, 0.8, 0.2, etc.
 
-    1) Hard label (one-hot label, so every sample has exactly one class)
+    The calculation of this operator includes the following two steps.
 
-    .. math::
+    -  **I.softmax cross entropy** 
 
-        loss_j =  -\\text{logits}_{label_j} +
-        \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K
+        1. Hard label (each sample can only be assigned into one category)
 
-    2) Soft label (each sample can have a distribution over all classes)
+        1.1. when use_softmax=True
 
-    .. math::
+            .. math::
+              \\loss_j=-\text{logits}_{label_j}+\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right) , j = 1,...,N
 
-        loss_j =  -\\sum_{i=0}^{K}\\text{label}_i
-        \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K}
-        \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K
+            where, N is the number of samples and C is the number of categories.
+
+        1.2. when use_softmax=False
+
+            .. math::
+              \\loss_j=-\log\left({P}_{label_j}\right) , j = 1,...,N
+
+            where, N is the number of samples and C is the number of categories, P is input(the output of softmax).
+
+
+        2. Soft label (each sample is assigned to multiple categories with a certain probability, and the probability sum is 1).
+
+        2.1. when use_softmax=True
+
+            .. math::
+              \\loss_j=-\sum_{i=0}^{C}\text{label}_i\left(\text{logits}_i-\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right)\right) , j = 1,...,N
+
+            where, N is the number of samples and C is the number of categories.
+
+        2.2. when use_softmax=False
+
+            .. math::
+              \\loss_j=-\sum_{j=0}^{C}\left({label}_j*\log\left({P}_{label_j}\right)\right) , j = 1,...,N
+
+            where, N is the number of samples and C is the number of categories, P is input(the output of softmax).
+
+
+
+    -  **II.Weight and reduction processing** 
+
+        1. Weight
+
+            If the ``weight`` parameter is ``None`` , go to the next step directly.
+
+            If the ``weight`` parameter is not ``None`` , the cross entropy of each sample is weighted by weight
+            according to soft_label = False or True as follows.
+
+            1.1. Hard labels (soft_label = False)
+
+            .. math::
+                \\loss_j=loss_j*weight[label_j] 
 
- 
-    It is useful when training a classification problem with ``C`` classes.
 
+            1.2. Soft labels (soft_label = True)
 
+             .. math::
+                \\loss_j=loss_j*\sum_{i}\left(weight[label_i]*logits_i\right)
+
+        2. reduction
+
+            2.1 if the ``reduction`` parameter is ``none`` 
+
+            Return the previous result directly
+
+            2.2 if the ``reduction`` parameter is ``sum`` 
+
+            Return the sum of the previous results
+
+            .. math::
+               \\loss=\sum_{j}loss_j
+
+            2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to 
+            the ``weight`` parameter as follows. 
+
+            2.3.1. If the  ``weight``  parameter is ``None`` 
+
+            Return the average value of the previous results
+
+             .. math::
+                \\loss=\sum_{j}loss_j/N
+
+            where, N is the number of samples and C is the number of categories.
+
+            2.3.2. If the 'weight' parameter is not 'None', the weighted average value of the previous result will be returned
+
+            1. Hard labels (soft_label = False)
+
+             .. math::
+                \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 
+
+            2. Soft labels (soft_label = True)
+
+             .. math::
+                \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right)
+ 
+ 
     Parameters:
-        input (Tensor): Input tensor, the data type is float32, float64. Shape is
-	    (N, C), where C is number of classes, and if shape is more than 2D, this
-	    is (N, C, D1, D2,..., Dk), k >= 1.
-        label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
-	    value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
-	    (N, D1, D2,..., Dk), k >= 1.
-        weight (Tensor, optional): Weight tensor, a manual rescaling weight given
-            to each class and the shape is (C). It has the same dimensions as class
-	    number and the data type is float32, float64. Default is ``'None'``.
-        reduction (str, optional): Indicate how to average the loss by batch_size,
+
+        - **weight** (Tensor, optional)
+
+            a manual rescaling weight given to each class. 
+            If given, has to be a Tensor of size C and the data type is float32, float64. 
+            Default is ``'None'`` .
+
+        - **ignore_index** (int64, optional)
+
+            Specifies a target value that is ignored
+            and does not contribute to the loss. A negative value means that no label 
+            value needs to be ignored. Only valid when soft_label = False.  
+            Default is ``-100`` .
+
+        - **reduction** (str, optional)
+
+            Indicate how to average the loss by batch_size,
             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
             If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
             If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
             If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
             Default is ``'mean'``.
-        ignore_index (int64, optional): Specifies a target value that is ignored
-            and does not contribute to the input gradient. Default is ``-100``.
-        soft_label (bool): indicate whether label is soft. Default False, meaning that
-                the label is hard. If soft_label=True, the label is soft.
-        axis (int, optional): The index of dimension to perform softmax calculations. It 
-                              should be in range :math:`[-1, rank - 1]`, while :math:`rank`
-                              is the rank of input :attr:`logits`. Default: -1.
 
+        - **soft_label** (bool, optional)
 
-    Returns:
-        Tensor. The tensor storing the cross_entropy_loss of input and label.
+            Indicate whether label is soft. 
+            If soft_label=False, the label is hard.  If soft_label=True, the label is soft.
+            Default is ``False``.
 
+        - **axis** (int, optional)
+
+            The index of dimension to perform softmax calculations. 
+            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number 
+            of dimensions of input :attr:`input`. 
+            Default is ``-1`` .
+
+        - **use_softmax** (bool, optional)
+
+            Indicate whether compute softmax before cross_entropy.
+            Default is ``True``.
+
+        - **name** (str，optional)
+
+            The name of the operator. Default is ``None`` .
+            For more information, please refer to :ref:`api_guide_Name` .
+
+
+    Shape:
+
+        - **input** (Tensor)
+
+            Input tensor, the data type is float32, float64. Shape is
+	    :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` . 
+
+            Note: 
+
+                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the 
+                output of softmax operator, which will produce incorrect results.
+
+                2. when use_softmax=False, it expects the output of softmax operator.
+ 
+
+        - **label** (Tensor)
+
+            1. If soft_label=False，the shape is 
+            :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
+            the data type is int32, int64, float32, float64, where each value is [0, C-1].
+
+            2. If soft_label=True, the shape and data type should be same with ``input`` , 
+            and the sum of the labels for each sample should be 1.
+ 
+        - **output** (Tensor)
+
+            Return the softmax cross_entropy loss of ``input`` and ``label``.
+
+            The data type is the same as input.
+
+            If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
+
+            If :attr:`reduction` is ``'none'``:
+
+            1. If soft_label = False, the dimension of return value is the same with ``label`` . 
+
+            2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . 
+
+     Example1(hard labels):
 
-    Examples:
         .. code-block:: python
             
             import paddle
-            import numpy as np
+            paddle.seed(99999)
+            N=100
+            C=200
+            reduction='mean'
+            input =  paddle.rand([N, C], dtype='float64')  
+            label =  paddle.randint(0, C, shape=[N], dtype='int64')
+            weight = paddle.rand([C], dtype='float64') 
+            
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                weight=weight, reduction=reduction)
+            dy_ret = cross_entropy_loss(
+                                       input,
+                                       label)
+            print(dy_ret.numpy()) #[5.41993642]
+
+
+    Example2(soft labels):
+
+        .. code-block:: python
+            
+            import paddle
+            paddle.seed(99999)
+            axis = -1
+            ignore_index = -100
+            N = 4
+            C = 3
+            shape = [N, C]
+            reduction='mean'
+            weight = None
+            logits = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
+            labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
+            labels /= paddle.sum(labels, axis=axis, keepdim=True)
+            paddle_loss_mean = paddle.nn.functional.cross_entropy(
+                                                                  logits,  
+                                                                  labels, 
+                                                                  soft_label=True, 
+                                                                  axis=axis,
+                                                                  weight=weight,
+                                                                  reduction=reduction)
+            print(paddle_loss_mean.numpy()) #[1.12908343]
 
-            input_data = paddle.uniform([5, 100], dtype="float64")
-            label_data = np.random.randint(0, 100, size=(5)).astype(np.int64)
-            weight_data = np.random.random([100]).astype("float64")
-            input =  paddle.to_tensor(input_data)
-            label =  paddle.to_tensor(label_data)
-            weight = paddle.to_tensor(weight_data)
-            ce_loss = paddle.nn.CrossEntropyLoss(weight=weight, reduction='mean')
-            output = ce_loss(input, label)
-            print(output)
-            # [4.84496039]
     """
 
     def __init__(self,
@@ -229,6 +393,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
                  reduction='mean',
                  soft_label=False,
                  axis=-1,
+                 use_softmax=True,
                  name=None):
         super(CrossEntropyLoss, self).__init__()
         self.weight = weight
@@ -236,6 +401,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
         self.ignore_index = ignore_index
         self.soft_label = soft_label
         self.axis = axis
+        self.use_softmax = use_softmax
         self.name = name
 
     def forward(self, input, label):
@@ -247,6 +413,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
             reduction=self.reduction,
             soft_label=self.soft_label,
             axis=self.axis,
+            use_softmax=self.use_softmax,
             name=self.name)
 
         return ret
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index b0c05cf8de..0cafbda893 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -351,7 +351,7 @@ class Adam(Optimizer):
         """
         params_grads = []
         for param in self._parameter_list:
-            if not param.trainable:
+            if param.stop_gradient:
                 continue
             if param._grad_ivar() is not None:
                 grad_var = param._grad_ivar()
diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py
index bd65fc19c3..4a6c2278a4 100644
--- a/python/paddle/optimizer/adamax.py
+++ b/python/paddle/optimizer/adamax.py
@@ -184,7 +184,7 @@ class Adamax(Optimizer):
         """
         assert isinstance(block, framework.Block)
         for param, grad in parameters_and_grads:
-            if grad is None or param.trainable is False:
+            if grad is None or param.stop_gradient is True:
                 continue
             with param.block.program._optimized_guard(
                 [param, grad]), name_scope('adamax'):
diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py
index 5085911ce9..484b4fb724 100644
--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
@@ -786,9 +786,8 @@ class LinearWarmup(LRScheduler):
                 self.last_epoch) / float(self.warmup_steps) + self.start_lr
         else:
             if isinstance(self.learning_rate, LRScheduler):
-                lr_value = self.learning_rate()
-                self.learning_rate.step()
-                return lr_value
+                self.learning_rate.step(self.last_epoch - self.warmup_steps)
+                return self.learning_rate()
 
             return self.learning_rate
 
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 212dad7c77..b37d172606 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -542,7 +542,7 @@ class Optimizer(object):
 
     def _update_param_device_map(self, parameters_and_grads, target_block):
         for param_and_grad in parameters_and_grads:
-            if param_and_grad[0].trainable is True:
+            if param_and_grad[0].stop_gradient is False:
                 param_name = param_and_grad[0].name
                 ops = target_block.ops
                 device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
@@ -598,14 +598,14 @@ class Optimizer(object):
         self._update_param_device_map(parameters_and_grads, target_block)
         self._create_accumulators(
             target_block,
-            [p[0] for p in parameters_and_grads if p[0].trainable])
+            [p[0] for p in parameters_and_grads if not p[0].stop_gradient])
         self._create_global_learning_rate()
 
         if framework.in_dygraph_mode():
             for param_and_grad in parameters_and_grads:
                 if param_and_grad[1] is None:
                     continue
-                if param_and_grad[0].trainable is True:
+                if param_and_grad[0].stop_gradient is False:
                     self._append_optimize_op(target_block, param_and_grad)
         else:
             for param_and_grad in parameters_and_grads:
@@ -613,7 +613,7 @@ class Optimizer(object):
                     continue
                 with param_and_grad[0].block.program._optimized_guard(
                         param_and_grad), name_scope("optimizer"):
-                    if param_and_grad[0].trainable is True:
+                    if param_and_grad[0].stop_gradient is False:
                         device = self._get_device_for_param(param_and_grad[0]
                                                             .name)
                         with device_guard(device):
@@ -689,7 +689,7 @@ class Optimizer(object):
 
             params_grads = []
             for param in parameter_list:
-                if not param.trainable:
+                if param.stop_gradient:
                     continue
                 if param._grad_ivar() is not None:
                     # create gradient tensor
@@ -789,8 +789,9 @@ class Optimizer(object):
     def _get_no_grad_set(self, loss, no_grad_set=None):
         no_grad_set = _get_no_grad_set_name(no_grad_set)
         parameters = loss.block.program.global_block().all_parameters()
-        param_no_trainable = set(
-            [param.name for param in parameters if param.trainable is False])
+        param_no_trainable = set([
+            param.name for param in parameters if param.stop_gradient is True
+        ])
         # If the parameter is no trainable, it should not have a gradient.
         no_grad_set.update(param_no_trainable)
 
@@ -825,7 +826,7 @@ class Optimizer(object):
 
         """
         for p in self._parameter_list:
-            if p.trainable:
+            if not p.stop_gradient:
                 p.clear_gradient()
 
     @imperative_base.no_grad
@@ -920,7 +921,7 @@ class Optimizer(object):
         """
         params_grads = []
         for param in self._parameter_list:
-            if not param.trainable:
+            if param.stop_gradient:
                 continue
             if param._grad_ivar() is not None:
                 grad_var = param._grad_ivar()
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 056a022672..69ee296230 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -168,7 +168,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
             data = data.astype(default_type)
 
     if dtype and convert_dtype(dtype) != data.dtype:
-        data = data.astype(dtype)
+        data = data.astype(convert_dtype(dtype))
 
     return paddle.Tensor(
         value=data,
diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
index 1db1b66426..d32fa4c88c 100644
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
@@ -20,7 +20,6 @@ from .lazy_import import try_import
 from .op_version import OpLastCheckpointChecker
 from .install_check import run_check
 from ..fluid.framework import unique_name
-from ..fluid.framework import load_op_library
 from ..fluid.framework import require_version
 
 from . import download
@@ -30,4 +29,4 @@ from . import cpp_extension
 __all__ = ['dump_config', 'deprecated', 'download', 'run_check']
 
 #TODO: define new api under this directory
-__all__ += ['unique_name', 'load_op_library', 'require_version']
+__all__ += ['unique_name', 'require_version']
diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index d84ae67fff..606f5465e1 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -26,7 +26,7 @@ from .extension_utils import find_cuda_home, find_rocm_home, normalize_extension
 from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags
 from .extension_utils import _import_module_from_library, _write_setup_file, _jit_compile
 from .extension_utils import check_abi_compatibility, log_v, CustomOpInfo, parse_op_name_from
-from .extension_utils import use_new_custom_op_load_method, clean_object_if_change_cflags
+from .extension_utils import clean_object_if_change_cflags
 from .extension_utils import bootstrap_context, get_build_directory, add_std_without_repeat
 
 from .extension_utils import IS_WINDOWS, OS_NAME, MSVC_COMPILE_FLAGS, MSVC_COMPILE_FLAGS
@@ -400,14 +400,14 @@ class BuildExtension(build_ext, object):
                 # ncvv compile CUDA source
                 if is_cuda_file(src):
                     if core.is_compiled_with_rocm():
-                        assert ROCM_HOME is not None
+                        assert ROCM_HOME is not None, "Not found ROCM runtime, please use `export ROCM_PATH= XXX` to specific it."
                         hipcc_cmd = os.path.join(ROCM_HOME, 'bin', 'hipcc')
                         self.compiler.set_executable('compiler_so', hipcc_cmd)
                         # {'nvcc': {}, 'cxx: {}}
                         if isinstance(cflags, dict):
                             cflags = cflags['hipcc']
                     else:
-                        assert CUDA_HOME is not None
+                        assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it."
                         nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc')
                         self.compiler.set_executable('compiler_so', nvcc_cmd)
                         # {'nvcc': {}, 'cxx: {}}
@@ -470,7 +470,7 @@ class BuildExtension(build_ext, object):
                 src = src_list[0]
                 obj = obj_list[0]
                 if is_cuda_file(src):
-                    assert CUDA_HOME is not None
+                    assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it."
                     nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc')
                     if isinstance(self.cflags, dict):
                         cflags = self.cflags['nvcc']
diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index 1ff42a7bcb..65655eaf48 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -28,7 +28,6 @@ import subprocess
 from contextlib import contextmanager
 from setuptools.command import bdist_egg
 
-from .. import load_op_library
 from ...fluid import core
 from ...fluid.framework import OpProtoHolder
 from ...sysconfig import get_include, get_lib
@@ -86,7 +85,6 @@ information
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 '''
-USING_NEW_CUSTOM_OP_LOAD_METHOD = True
 
 DEFAULT_OP_ATTR_NAMES = [
     core.op_proto_and_checker_maker.kOpRoleAttrName(),
@@ -97,18 +95,6 @@ DEFAULT_OP_ATTR_NAMES = [
 ]
 
 
-# NOTE(chenweihang): In order to be compatible with
-# the two custom op define method, after removing
-# old method, we can remove them together
-def use_new_custom_op_load_method(*args):
-    global USING_NEW_CUSTOM_OP_LOAD_METHOD
-    if len(args) == 0:
-        return USING_NEW_CUSTOM_OP_LOAD_METHOD
-    else:
-        assert len(args) == 1 and isinstance(args[0], bool)
-        USING_NEW_CUSTOM_OP_LOAD_METHOD = args[0]
-
-
 @contextmanager
 def bootstrap_context():
     """
@@ -122,10 +108,7 @@ def bootstrap_context():
 
 
 def load_op_meta_info_and_register_op(lib_filename):
-    if USING_NEW_CUSTOM_OP_LOAD_METHOD:
-        core.load_op_meta_info_and_register_op(lib_filename)
-    else:
-        core.load_op_library(lib_filename)
+    core.load_op_meta_info_and_register_op(lib_filename)
     return OpProtoHolder.instance().update_op_proto()
 
 
@@ -406,10 +389,7 @@ def normalize_extension_kwargs(kwargs, use_cuda=False):
 
         # append link flags
         extra_link_args = kwargs.get('extra_link_args', [])
-        if use_new_custom_op_load_method():
-            extra_link_args.append('-lpaddle_custom_op')
-        else:
-            extra_link_args.append('-lpaddle_framework')
+        extra_link_args.append('-lpaddle_custom_op')
         if use_cuda:
             extra_link_args.append('-lcudart')
 
@@ -461,9 +441,6 @@ def find_cuda_home():
     if cuda_home and not os.path.exists(
             cuda_home) and core.is_compiled_with_cuda():
         cuda_home = None
-        warnings.warn(
-            "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it."
-        )
 
     return cuda_home
 
@@ -494,9 +471,6 @@ def find_rocm_home():
     if rocm_home and not os.path.exists(
             rocm_home) and core.is_compiled_with_rocm():
         rocm_home = None
-        warnings.warn(
-            "Not found ROCM runtime, please use `export ROCM_PATH= XXX` to specific it."
-        )
 
     return rocm_home
 
@@ -817,9 +791,7 @@ def _write_setup_file(name,
     import os
     from paddle.utils.cpp_extension import CppExtension, CUDAExtension, BuildExtension, setup
     from paddle.utils.cpp_extension import get_build_directory
-    from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method
 
-    use_new_custom_op_load_method({use_new_method})
 
     setup(
         name='{name}',
@@ -847,8 +819,7 @@ def _write_setup_file(name,
         extra_cxx_cflags=list2str(extra_cxx_cflags),
         extra_cuda_cflags=list2str(extra_cuda_cflags),
         extra_link_args=list2str(link_args),
-        build_dir=build_dir,
-        use_new_method=use_new_custom_op_load_method())
+        build_dir=build_dir)
 
     log_v('write setup.py into {}'.format(file_path), verbose)
     with open(file_path, 'w') as f:
@@ -904,11 +875,7 @@ def parse_op_name_from(sources):
     """
 
     def regex(content):
-        if USING_NEW_CUSTOM_OP_LOAD_METHOD:
-            pattern = re.compile(r'PD_BUILD_OP\(([^,\)]+)\)')
-        else:
-            pattern = re.compile(r'REGISTER_OPERATOR\(([^,]+),')
-
+        pattern = re.compile(r'PD_BUILD_OP\(([^,\)]+)\)')
         content = re.sub(r'\s|\t|\n', '', content)
         op_name = pattern.findall(content)
         op_name = set([re.sub('_grad', '', name) for name in op_name])
diff --git a/python/setup.py.in b/python/setup.py.in
index 69a8bc771a..868cb97ce0 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -347,11 +347,14 @@ if '${WITH_XPU}' == 'OFF' and '${XPU_SDK_ROOT}' != '':
         shutil.copy(xpu_rt_lib, libs_path)
         package_data['paddle.libs']+=['libxpurt.so']
 
+<<<<<<< HEAD
 ### Old custom op extension mechanism related, will be removed in 2.1.0 ###
 # copy libpaddle_framework.so to libs on linux
 if sys.platform.startswith('linux'):
     shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path)
     package_data['paddle.libs'] += ['libpaddle_framework.so']
+=======
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
 ### New custom op extension mechanism related ###
 # copy libpaddle_custom_op.so to libs on linux
@@ -405,25 +408,8 @@ def find_files(pattern, root):
 
 headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/framework')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/imperative')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/memory')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
-    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
-    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
-    list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) + # errorMessage.pb for errormessage
-    ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen
-    list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen
-    list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen
-    list(find_files('*', '${GFLAGS_INSTALL_DIR}/include')) + # gflags
-    list(find_files('*', '${GLOG_INSTALL_DIR}/include')) + # glog
-    list(find_files('*', '${BOOST_INCLUDE_DIR}/boost')) + # boost
-    list(find_files('*', '${XXHASH_INSTALL_DIR}/include')) + # xxhash
-    list(find_files('*', '${PROTOBUF_INCLUDE_DIR}')) + # protobuf
-    list(find_files('*', '${DLPACK_INCLUDE_DIR}')) + # dlpack
-    list(find_files('*.h', '${THREADPOOL_INCLUDE_DIR}'))) # threadpool
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension')) +  # extension
+    list(find_files('*', '${BOOST_INCLUDE_DIR}/boost')))  # boost
 
 if '${WITH_MKLDNN}' == 'ON':
     headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn
@@ -463,6 +449,7 @@ class InstallHeaders(Command):
                                    ('install_headers', 'install_dir'),
                                    ('force', 'force'))
 
+<<<<<<< HEAD
     def copy_data_type_headers(self, header):
         if os.name == 'nt':
             data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h', 'platform\\float16.h']
@@ -474,6 +461,20 @@ class InstallHeaders(Command):
                 if not os.path.exists(install_dir):
                     self.mkpath(install_dir)
                 return self.copy_file(header, install_dir)
+=======
+    def copy_data_type_headers(self):
+        # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform`
+        # to `extension/incude`,
+        data_type_headers = (['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex64.h'] + 
+                             ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex128.h'] + 
+                             ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h'])
+
+        install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include")
+        if not os.path.exists(install_dir):
+            self.mkpath(install_dir)
+        for header in data_type_headers:
+            self.copy_file(header, install_dir)
+>>>>>>> 587d99ae443c684faa25d1fd261eb81d37cb32e4
 
     def mkdir_and_copy_file(self, header):
         if 'pb.h' in header:
@@ -509,6 +510,7 @@ class InstallHeaders(Command):
         for header in hdrs:
             (out, _) = self.mkdir_and_copy_file(header)
             self.outfiles.append(out)
+        self.copy_data_type_headers()
 
     def get_inputs(self):
         return self.distribution.headers or []
@@ -534,10 +536,10 @@ else:
 
 # Log for PYPI
 if sys.version_info > (3,0):
-    with open("@PADDLE_BINARY_DIR@/python/paddle/README.md", "r", encoding='UTF-8') as f:
+    with open("@PADDLE_BINARY_DIR@/python/paddle/README.rst", "r", encoding='UTF-8') as f:
         long_description = f.read()
 else:
-    with open("@PADDLE_BINARY_DIR@/python/paddle/README.md", "r")as f:
+    with open("@PADDLE_BINARY_DIR@/python/paddle/README.rst", "r")as f:
         long_description = unicode(f.read(), 'UTF-8')
 
 with redirect_stdout():
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index 6453eb48d7..ab5b6516b9 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -606,6 +606,7 @@ STATIC_MODE_TESTING_LIST = [
     'test_fusion_gru_bf16_mkldnn_op',
     'test_fusion_gru_mkldnn_op',
     'test_fusion_lstm_mkldnn_op',
+    'test_fusion_lstm_int8_mkldnn_op',
     'test_fusion_lstm_bf16_mkldnn_op',
     'test_gaussian_random_mkldnn_op',
     'test_lrn_mkldnn_op',
diff --git a/tools/test_op_benchmark.sh b/tools/test_op_benchmark.sh
index f0937ca7df..4f7288eb12 100644
--- a/tools/test_op_benchmark.sh
+++ b/tools/test_op_benchmark.sh
@@ -187,7 +187,7 @@ function run_op_benchmark_test {
   done
   # install tensorflow for testing accuary
   pip install tensorflow==2.3.0 tensorflow-probability
-  for branch_name in "develop" "test_pr"
+  for branch_name in "develop" "test"
   do
     git checkout $branch_name
     [ $? -ne 0 ] && LOG "[FATAL] Missing branch ${branch_name}." && exit 7
@@ -263,7 +263,7 @@ function summary_problems {
   done
   if [ $exit_code -ne 0 ]; then
     LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details."
-    LOG "[INFO] Or you can apply for one RD (GaoWei8(Recommend), Xreki, luotao1) approval to pass this PR."
+    LOG "[INFO] Or you can apply for one RD (Avin0323(Recommend), Xreki, luotao1) approval to pass this PR."
     exit $exit_code
   fi
 }
-- 
Gitee