diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 5a755a816c332a2517ed61caa94d647afd557aae..f68db1eab3d877a19ab3ed88cb05d7ca342397cc 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -14,11 +14,11 @@
 
 include(ExternalProject)
 
-# update eigen to the commit id 4da2c6b1 on 03/19/2020
+# Eigen is pinned to commit f612df27 (updated on 03/16/2021)
 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
 set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
 set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
-set(EIGEN_TAG        4da2c6b1974827b1999bab652a3d4703e1992d26)
+set(EIGEN_TAG        f612df273689a19d25b45ca4f8269463207c4fee)
 
 cache_third_party(extern_eigen3
     REPOSITORY    ${EIGEN_REPOSITORY}
@@ -27,48 +27,6 @@ cache_third_party(extern_eigen3
 
 if(WIN32)
     add_definitions(-DEIGEN_STRONG_INLINE=inline)
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
-    # For Windows
-    # which will cause a compilation error in Tensor:74:
-    # "can not open file 'unistd.h'"
-    # so use following patch to solve compilation error On Windows.
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2)
-    # For VS2015
-    # which will cause a compilation error in TensorBlock.h:1028:
-    # "syntax error"
-    # so use following patch to solve compilation error On Windows.
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3)
-    set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y)
-elseif(LINUX)
-    # For gxx=4.8, __GXX_ABI_VERSION is less than 1004
-    # which will cause a compilation error in Geometry_SSE.h:38:
-    # "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)"
-    # refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60
-    # add -fabi-version=4 could avoid above error, but will cause "double free corruption" when compile with gcc8
-    # so use following patch to solve compilation error with different version of gcc.
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1)
-    # The compiler fully support const expressions since c++14,
-    # but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11
-    # add patch to avoid compilation error in c++11
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2)
-    if(WITH_ROCM)
-        # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC
-        # which will cause compiler error of using __host__ funciont in __host__ __device__
-        file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3)
-        file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3)
-        # For HIPCC Eigen::internal::scalar_sum_op<bool,bool> is not EIGEN_DEVICE_FUNC
-        # which will cause compiler error of using __host__ funciont in __host__ __device__
-        file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4)
-        file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4)
-        set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4})
-    else()
-        set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2})
-    endif()
 endif()
 
 set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR})
@@ -82,7 +40,7 @@ ExternalProject_Add(
     PREFIX          ${EIGEN_PREFIX_DIR}
     SOURCE_DIR      ${EIGEN_SOURCE_DIR}
     UPDATE_COMMAND    ""
-    PATCH_COMMAND   ${EIGEN_PATCH_COMMAND}
+    PATCH_COMMAND     ""
     CONFIGURE_COMMAND ""
     BUILD_COMMAND     ""
     INSTALL_COMMAND   ""
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index e740771e5ca9fce832d86b48935493b9d334c6f3..f87a30bdd79d188c7ab7e2c1311ddbc877b9d752 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -169,4 +169,4 @@ void SectionWorker::TrainFiles() {
 
 }  // namespace framework
 }  // namespace paddle
-#endif
+#endif
\ No newline at end of file
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 21ef3b2312ff6d6bfeae6b3ce216af2bc9bc1db4..40bdc2aa0bb25020561dbc33c71b349a468d72cf 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1300,4 +1300,4 @@ Predictor *PredictorPool::Retrive(size_t idx) {
   return preds_[idx - 1].get();
 }
 }  // namespace services
-}  // namespace paddle_infer
+}  // namespace paddle_infer
\ No newline at end of file
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index c95912a931e0bc740775bd59a6bfcf30eb981c81..3c1803c38080332a3c5d6242fcafb4a0b165f7a6 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -306,4 +306,4 @@ OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
 
 }  // namespace tensorrt
 }  // namespace inference
-}  // namespace paddle
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index c6d2fbccd8e84bfd563353f23769d818bc721f3c..df5dedb5649a1822c3e0ee65067d44d855f501a7 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -468,4 +468,4 @@ REGISTER_OP_CUDA_KERNEL(
                              ops::LogGradGradFunctor<double>>,
     ops::LogDoubleGradKernel<plat::CUDADeviceContext,
                              ops::LogGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
+/* ========================================================================== */
\ No newline at end of file
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index bc7def61b2e249d3fa5e8b1d915b86f50beaf77b..fb5c4db91ec20233341d13ad02743c205472adbb 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -400,7 +400,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out) const {
     auto temp1 = x < static_cast<T>(threshold * -1.f);
     auto temp2 = x > static_cast<T>(threshold);
-    out.device(d) = x * (temp1 + temp2 > 0).template cast<T>();
+    out.device(d) = x * (temp1 || temp2).template cast<T>();  // |x| > threshold
   }
 };
 
@@ -417,7 +417,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 = x < static_cast<T>(threshold * -1.f);
     auto temp2 = x > static_cast<T>(threshold);
-    dx.device(d) = dout * (temp1 + temp2 > 0).template cast<T>();
+    dx.device(d) = dout * (temp1 || temp2).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h
index a8ad729a31a4d2b3af9e524473238fb568015bb1..0db4cc71b1b21085513c4703475e651b8d8edd74 100644
--- a/paddle/fluid/platform/eigen_ext.h
+++ b/paddle/fluid/platform/eigen_ext.h
@@ -24,7 +24,6 @@
 
 namespace Eigen {
 
-using bfloat16 = paddle::platform::bfloat16;
 using complex64 = paddle::platform::complex64;
 using complex128 = paddle::platform::complex128;
 using float16 = paddle::platform::float16;
@@ -33,7 +32,8 @@ template <typename T>
 struct NumTraits;
 
 template <>
-struct NumTraits<bfloat16> : GenericNumTraits<bfloat16> {
+struct NumTraits<paddle::platform::bfloat16>
+    : GenericNumTraits<paddle::platform::bfloat16> {
   enum {
     IsSigned = true,
     IsInteger = false,
@@ -41,22 +41,22 @@ struct NumTraits<bfloat16> : GenericNumTraits<bfloat16> {
     RequireInitialization = false
   };
 
-  HOSTDEVICE static inline bfloat16 epsilon() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 epsilon() {
     return paddle::platform::raw_uint16_to_bfloat16(0x3400);
   }
-  HOSTDEVICE static inline bfloat16 dummy_precision() {
-    return bfloat16(1e-5f);
+  HOSTDEVICE static inline paddle::platform::bfloat16 dummy_precision() {
+    return paddle::platform::bfloat16(1e-5f);
   }
-  HOSTDEVICE static inline bfloat16 highest() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 highest() {
     return paddle::platform::raw_uint16_to_bfloat16(0x7f7f);
   }
-  HOSTDEVICE static inline bfloat16 lowest() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 lowest() {
     return paddle::platform::raw_uint16_to_bfloat16(0xff7f);
   }
-  HOSTDEVICE static inline bfloat16 infinity() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 infinity() {
     return paddle::platform::raw_uint16_to_bfloat16(0x7f80);
   }
-  HOSTDEVICE static inline bfloat16 quiet_NaN() {
+  HOSTDEVICE static inline paddle::platform::bfloat16 quiet_NaN() {
     return paddle::platform::raw_uint16_to_bfloat16(0xffc1);
   }
 };
@@ -137,68 +137,91 @@ namespace numext {
 //////////// bfloat methods /////////////
 
 template <>
-HOSTDEVICE inline bool(isnan)(const bfloat16& a) {
+HOSTDEVICE inline bool(isnan)(const paddle::platform::bfloat16& a) {
   return (paddle::platform::isnan)(a);
 }
 
 template <>
-HOSTDEVICE inline bool(isinf)(const bfloat16& a) {
+HOSTDEVICE inline bool(isinf)(const paddle::platform::bfloat16& a) {
   return (paddle::platform::isinf)(a);
 }
 
 template <>
-HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
+HOSTDEVICE inline bool(isfinite)(const paddle::platform::bfloat16& a) {
   return (paddle::platform::isfinite)(a);
 }
 
 template <>
-HOSTDEVICE inline bfloat16 exp(const bfloat16& a) {
-  return bfloat16(::expf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 exp(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::expf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 erf(const bfloat16& a) {
-  return bfloat16(::erff(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 erf(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::erff(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 log(const bfloat16& a) {
-  return bfloat16(::logf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 log(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::logf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) {
-  return bfloat16(::tanhf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 tanh(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::tanhf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) {
-  return bfloat16(::sqrtf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 sqrt(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::sqrtf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) {
-  return bfloat16(::ceilf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 ceil(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::ceilf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 floor(const bfloat16& a) {
-  return bfloat16(::floorf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 floor(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::floorf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 round(const bfloat16& a) {
-  return bfloat16(::roundf(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 round(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::roundf(static_cast<float>(a)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) {
-  return bfloat16(::powf(static_cast<float>(a), static_cast<float>(b)));
+HOSTDEVICE inline paddle::platform::bfloat16 pow(
+    const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
+  return paddle::platform::bfloat16(
+      ::powf(static_cast<float>(a), static_cast<float>(b)));
 }
 
 template <>
-HOSTDEVICE inline bfloat16 abs(const bfloat16& a) {
-  return bfloat16(::fabs(static_cast<float>(a)));
+HOSTDEVICE inline paddle::platform::bfloat16 abs(
+    const paddle::platform::bfloat16& a) {
+  return paddle::platform::bfloat16(::fabsf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline paddle::platform::bfloat16 mini(
+    const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
+  return b < a ? b : a;  // returns a unless b < a, so ties keep a
+}
+
+template <>
+HOSTDEVICE inline paddle::platform::bfloat16 maxi(
+    const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) {
+  return a < b ? b : a;  // returns a unless a < b, so ties keep a
 }
 
 //////////// complex64 methods /////////////
@@ -398,5 +421,15 @@ HOSTDEVICE inline float16 abs(const float16& a) {
   return float16(::fabs(static_cast<float>(a)));
 }
 
+template <>
+HOSTDEVICE inline float16 mini(const float16& a, const float16& b) {
+  return b < a ? b : a;  // returns a unless b < a, so ties keep a
+}
+
+template <>
+HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) {
+  return a < b ? b : a;  // returns a unless a < b, so ties keep a
+}
+
 }  // namespace numext
 }  // namespace Eigen
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index 6f435bb86ba5acedaf53d25207ff68d1d60c82e8..c5e2cff5b4f7d157102ad42f9b5142ff628df5d3 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -261,4 +261,4 @@ class PipelineOptimizer(MetaOptimizerBase):
                             'ring_id': ring_id,
                             'use_calc_stream': True,
                             OP_ROLE_KEY: OpRole.Optimize
-                        })
+                        })
\ No newline at end of file
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index f4620ff00013c85084484cb330f685a1b83c4cfc..21b40454408ef931a158db24a38c7e8d0debb13b 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -55,7 +55,6 @@ class ImperativeQuantAware(object):
                  act_quantize_layer=None):
         """
         The constructor for ImperativeQuantAware.
-
         Args:
             quantizable_layer_type(list[str | layer]): List the type of
                 layers that will be quantized. Default is ['Conv2D', 'Linear'].
@@ -103,16 +102,13 @@ class ImperativeQuantAware(object):
                 activation and returns dequantized activation. 
                 If None, will use quantization op defined by 'activation_quantize_type'.
                 Default is None.
-
         Note:
             If user sets attribute 'skip_quant' to a Layer that support dynamic
             quantization and sets it to true, the layer would not be quantized
             during training. If this attribute is not sets or the attribute is
             false, the Layer would be qunatized in training.
-
         Examples 1:
         .. code-block:: python
-
             import paddle
             from paddle.fluid.contrib.slim.quantization \
                 import ImperativeQuantAware
@@ -120,7 +116,6 @@ class ImperativeQuantAware(object):
                 import resnet
             
             model = resnet.resnet50(pretrained=True)
-
             imperative_qat = ImperativeQuantAware(
                 weight_quantize_type='abs_max',
                 activation_quantize_type='moving_average_abs_max')
@@ -129,7 +124,6 @@ class ImperativeQuantAware(object):
             # The original model will be rewrite.
             # The outscale of outputs in supportted layers would be calculated.
             imperative_qat.quantize(model)
-
             # Fine-tune the quantized model
             # ...
             
@@ -140,45 +134,36 @@ class ImperativeQuantAware(object):
                 input_spec=[
                     paddle.static.InputSpec(
                     shape=[None, 3, 224, 224], dtype='float32')])
-
         Examples 2:
         .. code-block:: python
-
             import paddle
             from paddle.fluid.contrib.slim.quantization \
                 import ImperativeQuantAware
-
             class ImperativeModel(paddle.nn.Layer):
                 def __init__(self):
                     super(ImperativeModel, self).__init__()
                     # self.linear_0 would skip the quantization.
                     self.linear_0 = paddle.nn.Linear(784, 400)
                     self.linear_0.skip_quant = True
-
                     # self.linear_1 would not skip the quantization.
                     self.linear_1 = paddle.nn.Linear(400, 10)
                     self.linear_1.skip_quant = False
-
                 def forward(self, inputs):
                     x = self.linear_0(inputs)
                     x = self.linear_1(inputs)
                     return x
-
             model = ImperativeModel()
             imperative_qat = ImperativeQuantAware(
                 weight_quantize_type='abs_max',
                 activation_quantize_type='moving_average_abs_max')
-
             # Add the fake quant logical.
             # The original model will be rewrite.
             #
             # There is only one Layer(self.linear1) would be added the
             # fake quant logical.
             imperative_qat.quantize(model)
-
             # Fine-tune the quantized model
             # ...
-
             # Save quant model for the inference.
             imperative_qat.save_quantized_model(
                 layer=model,
@@ -210,7 +195,6 @@ class ImperativeQuantAware(object):
         fake_quantize_dequantize_moving_average_abs_max,
         fake_quantize_dequantize_abs_max and so on. At the same time,
         the out_scale value of outputs would be calculated.
-
         Args:
             model(fluid.dygraph.Layer): the model to be quantized.
         Returns:
@@ -245,7 +229,6 @@ class ImperativeQuantizeInputs(object):
                  act_quantize_layer=None):
         """
         The constructor for ImperativeQuantizeInputs. 
-
         Please refer to the args of ImperativeQuantAware.
         """
         super(ImperativeQuantizeInputs, self).__init__()
@@ -351,7 +334,6 @@ class ImperativeQuantizeOutputs(object):
     def __init__(self, moving_rate=0.9):
         """
         The constructor for ImperativeQuantizeOutputs.
-
         Args:
             moving_rate(float): The decay coefficient of moving average.
                                 The default value is 0.9.
@@ -363,11 +345,9 @@ class ImperativeQuantizeOutputs(object):
         """
         Insert the `moving_average_abs_max_scale` layers to calculate the
         output scales for specific layers in the dygraph model.
-
         Args:
             model(fluid.dygraph.Layer): The target model which would be
                 calculate the output quantization scale.
-
         Returns:
             None
         """
@@ -397,7 +377,6 @@ class ImperativeQuantizeOutputs(object):
     def save_quantized_model(self, layer, path, input_spec=None, **config):
         """
         Save the quantized model for the inference.
-
         Args:
             layer (Layer): The Layer to be saved.
             path (str): The path prefix to save model. The format is 
@@ -418,7 +397,6 @@ class ImperativeQuantizeOutputs(object):
                 If the provided ``output_spec`` list is not all output variables, 
                 the saved model will be pruned according to the given
                 ``output_spec`` list. 
-
         Returns:
             None
         """
@@ -514,4 +492,4 @@ class ImperativeQuantizeOutputs(object):
         previous_ops = [utils.find_previous_op(block, arg_name) \
             for arg_name in in_op.input_arg_names]
         return any(op is not None and op.type not in \
-            utils.fake_quantize_dequantize_types for op in previous_ops)
+            utils.fake_quantize_dequantize_types for op in previous_ops)
\ No newline at end of file
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index f45eb8c97f419e4cfb331ca99ea30c27514b19cd..c9c4ae9849eed93206557a00ae60361cab234c81 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -99,4 +99,4 @@ def find_next_ops(block, var_name):
     for op in block.ops:
         if var_name in op.input_arg_names:
             res_ops.append(op)
-    return res_ops
+    return res_ops
\ No newline at end of file
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
index 8d6ce76ef0fa5f3d1b1e9400c705ffc625fcf9bb..b38806f85679ca2ec45cfdb33357d9b3bfca4431 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
@@ -479,4 +479,4 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    unittest.main()
+    unittest.main()
\ No newline at end of file
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 2aa918bf806616a55c5e72e0664b6dd2458199ef..9871cbd36bd7cc3b1fabb065d8a28f406e12e9a8 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -57,7 +57,6 @@ __all__ = [
 
 class Optimizer(object):
     """Optimizer Base class.
-
     Define the common interface of an optimizer.
     User should not use this class directly,
     but need to use one of it's implementation.
@@ -135,22 +134,17 @@ class Optimizer(object):
         '''
         Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict.
         If the optimizer never be called(minimize function), the state_dict is empty.
-
         Args: None
         Return:
             state_dict(dict) : dict contains all the variable used by optimizer
         
         Examples:
             .. code-block:: python
-
                 import paddle.fluid as fluid
-
                 with fluid.dygraph.guard():
                     emb = fluid.dygraph.Embedding([10, 10])
-
                     adam = fluid.optimizer.Adam(0.001, parameter_list=emb.parameters())
                     state_dict = adam.state_dict()
-
         '''
         from paddle.optimizer.lr import LRScheduler
         state_dict = {}
@@ -179,7 +173,6 @@ class Optimizer(object):
     def set_state_dict(self, state_dict):
         '''
         Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed.
-
         Args: 
             state_dict(dict) : Dict contains all the Variable needed by optimizer
         Return:
@@ -187,17 +180,12 @@ class Optimizer(object):
         
         Examples:
             .. code-block:: python
-
                 import paddle
                 import paddle.fluid as fluid
-
                 paddle.disable_static()
-
                 emb = paddle.nn.Embedding(10, 10)
-
                 state_dict = emb.state_dict()
                 fluid.save_dygraph(state_dict, "paddle_dy")
-
                 scheduler = paddle.optimizer.lr.NoamDecay(	
                     d_model=0.01, warmup_steps=100, verbose=True)
                 adam = paddle.optimizer.Adam(
@@ -205,7 +193,6 @@ class Optimizer(object):
                     parameters=emb.parameters())
                 state_dict = adam.state_dict()
                 fluid.save_dygraph(state_dict, "paddle_dy")
-
                 para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
         '''
         from paddle.optimizer.lr import LRScheduler
@@ -349,23 +336,18 @@ class Optimizer(object):
         
         Set the value of the learning rate manually in the optimizer. If the optimizer use LearningRateDecay,
         this API cannot be invoked, because it will lead to conflict.
-
         Args:
             value (float|Variable): the value of learning rate
-
         Returns:
             None
           
         Examples:
             .. code-block:: python
-
                 import paddle.fluid as fluid
                         
                 with fluid.dygraph.guard():
                     linear = fluid.dygraph.nn.Linear(10, 10)
-
                     adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters())
-
                     # set learning rate manually by python float value
                     lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
                     for i in range(5):
@@ -378,8 +360,6 @@ class Optimizer(object):
                     #    current lr is 0.4
                     #    current lr is 0.5
                     #    current lr is 0.6
-
-
                     # set learning rate manually by framework Variable
                     lr_var = fluid.layers.create_global_var(
                         shape=[1], value=0.7, dtype='float32')
@@ -388,9 +368,6 @@ class Optimizer(object):
                     print("current lr is {}".format(lr))
                     # Print:
                     #    current lr is 0.7
-
-
-
         """
         if not isinstance(value, (framework.Variable, float)):
             raise TypeError(
@@ -426,23 +403,18 @@ class Optimizer(object):
         
         Get current step learning rate. The return value is all the same When LearningRateDecay is not used,
         otherwise return the step learning rate.
-
         Returns:
             float: The learning rate of the current step.
-
         Examples:
             .. code-block:: python
-
                 import paddle.fluid as fluid
                 import numpy as np
-
                 # example1: LearningRateDecay is not used, return value is all the same
                 with fluid.dygraph.guard():
                     emb = fluid.dygraph.Embedding([10, 10])
                     adam = fluid.optimizer.Adam(0.001, parameter_list = emb.parameters())
                     lr = adam.current_step_lr()
                     print(lr) # 0.001
-
                 # example2: PiecewiseDecay is used, return the step learning rate
                 with fluid.dygraph.guard():
                     inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
@@ -455,17 +427,14 @@ class Optimizer(object):
                     value = [0.2, 0.4, 0.6, 0.8, 1.0]
                     adam = fluid.optimizer.Adam(fluid.dygraph.PiecewiseDecay(bd, value, 0),
                                            parameter_list=linear.parameters())
-
                     # first step: learning rate is 0.2
                     np.allclose(adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0) # True
-
                     # learning rate for different steps
                     ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
                     for i in range(12):
                         adam.minimize(loss)
                         lr = adam.current_step_lr()
                         np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True
-
         """
         current_lr = self._global_learning_rate()
         if isinstance(current_lr, framework.Variable):
@@ -514,7 +483,6 @@ class Optimizer(object):
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
-
         Args:
             block: the block in which the loss variable is present
             parameters: list of parameter variables for the optimizer
@@ -524,11 +492,9 @@ class Optimizer(object):
     def _finish_update(self, block, parameters_and_grads):
         """Finish any custom updates needed
            before completing an optimization step
-
         Args:
             block: the block in which the loss variable is present
             parameters: list of parameter variables for the optimizer
-
         Returns:
             None
         """
@@ -543,7 +509,6 @@ class Optimizer(object):
                          type=None,
                          device=None):
         """Utility function to add an accumulator for a parameter
-
         Args:
             block: the block in which the loss variable is present
             name: name of the accumulator
@@ -591,11 +556,9 @@ class Optimizer(object):
 
     def _get_accumulator(self, name, param):
         """Utility function to fetch an accumulator for a parameter
-
         Args:
             name: name of the accumulator
             param: parameter variable for which accumulator is to be fetched
-
         Returns:
             accumulator variable for the parameter
         """
@@ -629,11 +592,9 @@ class Optimizer(object):
 
     def _create_optimization_pass(self, parameters_and_grads):
         """Add optimization operators to update gradients to variables.
-
         Args:
           parameters_and_grads(list(tuple(Variable, Variable))):
             a list of (variable, gradient) pair to update.
-
         Returns:
           return_op_list: a list of operators that will complete one step of
             optimization. This will include parameter update ops, global step
@@ -745,7 +706,6 @@ class Optimizer(object):
         """
         The first part of ``minimize``, do auto-diff to append backward operations for
         the current program.
-
         Args:
             loss (Variable): ``loss`` variable to run optimizations.
             startup_program (Program, optional): :ref:`api_fluid_Program` for
@@ -758,11 +718,9 @@ class Optimizer(object):
                 to be updated. The default value is None.
             callbacks (list, optional): list of callable objects to run when appending backward
                 operator for one parameter. The default value is None.
-
         Return:
             list: list of (param, grad) variable pairs, param is ``Parameter``,
                 grad is the gradient value corresponding to the parameter.
-
         Examples:
             See examples in ``apply_gradients``.
         """
@@ -809,16 +767,12 @@ class Optimizer(object):
         """
         Second part of `minimize`, appending optimization operators for
         given `params_grads` pairs.
-
         Args:
             params_grads (list): list of (param, grad) pair to do optimization.
-
         Returns:
             list: A list of operators appended to the current program.
-
         Examples:
             .. code-block:: python
-
                 import paddle.fluid as fluid
                 loss = network()
                 optimizer = fluid.optimizer.SGD(learning_rate=0.1)
@@ -883,7 +837,6 @@ class Optimizer(object):
     def clear_gradients(self):
         """
         Clear the gradients of all optimized parameters for model.
-
         If not cleared, new gradients will accumulate on top of the previous gradients.
         
         Returns:
@@ -891,10 +844,8 @@ class Optimizer(object):
         
         Examples:
             .. code-block:: python
-
                 import paddle.fluid as fluid
                 import numpy as np
-
                 with fluid.dygraph.guard():
                     value = np.arange(26).reshape(2, 13).astype("float32")
                     a = fluid.dygraph.to_variable(value)
@@ -906,7 +857,6 @@ class Optimizer(object):
                     out.backward()
                     adam.minimize(out)
                     adam.clear_gradients()
-
         """
         for p in self._parameter_list:
             if p.trainable:
@@ -920,7 +870,6 @@ class Optimizer(object):
                  no_grad_set=None):
         """
         Add operations to minimize ``loss`` by updating ``parameter_list``.
-
         Args:
             loss (Variable): A ``Variable`` containing the value to minimize.
             startup_program (Program, optional): :ref:`api_fluid_Program` for
@@ -931,7 +880,6 @@ class Optimizer(object):
                 will be updated.
             no_grad_set (set, optional): Set of ``Variable``  or ``Variable.name`` that don't need
                 to be updated. The default value is None.
-
         Returns:
             tuple: tuple (optimize_ops, params_grads), A list of operators appended
             by minimize and a list of (param, grad) variable pairs, param is
@@ -939,7 +887,6 @@ class Optimizer(object):
             The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to 
             indicate program pruning. If so, the program will be pruned by ``feed`` and 
             ``fetch_list`` before run, see details in ``Executor``.
-
         Examples:
             Please refer to the example of current Optimizer.
         """
@@ -963,11 +910,8 @@ class Optimizer(object):
 class SGDOptimizer(Optimizer):
     r"""
     Optimizer of the stochastic gradient descent algorithm.
-
     .. math::
-
         param\_out = param - learning\_rate * grad
-
     Parameters:
         learning_rate (float|Variable): The learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
@@ -985,14 +929,11 @@ class SGDOptimizer(Optimizer):
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information. \
             For details, please refer to :ref:`api_guide_Name`. Default is None.
-
     Examples:
         .. code-block:: python
-
             import paddle
             import paddle.fluid as fluid
             import numpy as np
-
             place = fluid.CPUPlace()
             main = fluid.Program()
             with fluid.program_guard(main):
@@ -1001,10 +942,8 @@ class SGDOptimizer(Optimizer):
                 y_predict = fluid.layers.fc(input=x, size=1, act=None)
                 cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                 avg_cost = fluid.layers.mean(cost)
-
                 sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
                 sgd_optimizer.minimize(avg_cost)
-
                 fetch_list = [avg_cost]
                 train_reader = paddle.batch(
                     paddle.dataset.uci_housing.train(), batch_size=1)
@@ -1013,7 +952,6 @@ class SGDOptimizer(Optimizer):
                 exe.run(fluid.default_startup_program())
                 for data in train_reader():
                     exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
     """
 
     def __init__(self,
@@ -1056,25 +994,15 @@ class SGDOptimizer(Optimizer):
 
 class MomentumOptimizer(Optimizer):
     r"""
-
     Simple Momentum optimizer with velocity state
-
     This optimizer has a flag for Nesterov Momentum.
-
     The update equations are as follows:
-
     .. math::
-
         & velocity = mu * velocity + gradient
-
         & if (use\_nesterov):
-
         &\quad   param = param - (gradient + mu * velocity) * learning\_rate
-
         & else:
-
         &\quad   param = param - learning\_rate * velocity
-
     Parameters:
         learning_rate (float|Variable): The learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
@@ -1094,14 +1022,11 @@ class MomentumOptimizer(Optimizer):
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information. \
             For details, please refer to :ref:`api_guide_Name`. Default is None.
-
     Examples:
         .. code-block:: python
-
             import paddle
             import paddle.fluid as fluid
             import numpy as np
-
             place = fluid.CPUPlace()
             main = fluid.Program()
             with fluid.program_guard(main):
@@ -1110,10 +1035,8 @@ class MomentumOptimizer(Optimizer):
                 y_predict = fluid.layers.fc(input=x, size=1, act=None)
                 cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                 avg_cost = fluid.layers.mean(cost)
-
                 moment_optimizer = fluid.optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
                 moment_optimizer.minimize(avg_cost)
-
                 fetch_list = [avg_cost]
                 train_reader = paddle.batch(
                     paddle.dataset.uci_housing.train(), batch_size=1)
@@ -1122,7 +1045,6 @@ class MomentumOptimizer(Optimizer):
                 exe.run(fluid.default_startup_program())
                 for data in train_reader():
                     exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
     """
     _velocity_acc_str = "velocity"
 
@@ -1192,29 +1114,18 @@ class MomentumOptimizer(Optimizer):
 class DGCMomentumOptimizer(Optimizer):
     r"""
 	:api_attr: Static Graph
-
     DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887
-
     DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\
         only gradients larger than a threshold are transmitted.
-
     To avoid losing information, DGC accumulates the rest of the gradients locally.
-
     Eventually, these gradients become large enough to be transmitted.
-
     Thus, DGC sends the large gradients immediately but eventually sends all of the gradients over time.
-
     To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance.
-
     DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication.
-
     This optimizer will do two things:
-
         1. Compress the gradient by selecting the TopK important values from the tensor \
             and use them for allreduce to reduce network bandwidth.
-
         2. Call momentum to optimize the cost.
-
     Args:
         learning_rate (float|Variable): The learning rate used to update parameters. \
             It can be a float value or a Variable with one float value as a data element.
@@ -1241,10 +1152,8 @@ class DGCMomentumOptimizer(Optimizer):
             meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information. \
             For details, please refer to :ref:`api_guide_Name`. Default is None.
-
     Examples:
         .. code-block:: python
-
             import paddle.fluid as fluid
             optimizer = fluid.optimizer.DGCMomentumOptimizer(
                         learning_rate=0.0001,
@@ -1252,7 +1161,6 @@ class DGCMomentumOptimizer(Optimizer):
                         rampup_step=1000,
                         rampup_begin_step=1252,
                         sparsity=[0.999, 0.999])
-
     """
     _u_velocity_acc_str = "_dgc_u_"
     _v_velocity_acc_str = "_dgc_v_"
@@ -1612,18 +1520,12 @@ class DGCMomentumOptimizer(Optimizer):
 class LarsMomentumOptimizer(Optimizer):
     r"""
     Momentum optimizer with LARS support
-
     The update equations are as follows:
-
     .. math::
-
         & local\_learning\_rate = learning\_rate * lars\_coeff * \\
           \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}
-
         & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon)
-
         & param = param - velocity
-
     Parameters:
         learning_rate (float|Variable): The learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element. \
@@ -1649,10 +1551,8 @@ class LarsMomentumOptimizer(Optimizer):
         
     Examples:
         .. code-block:: python
-
             import paddle.fluid as fluid
             import numpy as np
-
             np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
             inp = fluid.layers.data(
                 name="inp", shape=[2, 2], append_batch_size=False)
@@ -1660,7 +1560,6 @@ class LarsMomentumOptimizer(Optimizer):
             out = fluid.layers.reduce_sum(out)
             optimizer = fluid.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
             optimizer.minimize(out)
-
             exe = fluid.Executor(fluid.CPUPlace())
             exe.run(fluid.default_startup_program())
             exe.run(
@@ -1745,23 +1644,16 @@ class AdagradOptimizer(Optimizer):
     r"""
     The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign
     different learning rates to individual parameters.
-
     The parameter ``param_out`` update rule with gradient ``grad``:
-
     .. math::
-
         moment\_out &= moment + grad * grad
-
         param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
-
     Related paper: `Adaptive Subgradient Methods for Online Learning and
     Stochastic Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
-
     The original paper does not have the ``epsilon`` attribute. It is added here
     in our implementation as also proposed `Per-parameter adaptive learning rate
     methods <http://cs231n.github.io/neural-networks-3/#ada>`_
     for numerical stability to avoid the division by zero error.
-
     Args:
         learning_rate (float|Variable): The learning rate used to update ``Parameter``.
             It can be a float value or a ``Variable`` with a float type.
@@ -1784,20 +1676,16 @@ class AdagradOptimizer(Optimizer):
             The default value is None.
         initial_accumulator_value (float, optional): Initial value for moment accumulator.
             The default value is 0.0.
-
     Examples:
         .. code-block:: python
-
             import numpy as np
             import paddle.fluid as fluid
-
             np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
             inp = fluid.data(name="inp", shape=[2, 2])
             out = fluid.layers.fc(inp, size=3)
             out = fluid.layers.reduce_sum(out)
             optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.2)
             optimizer.minimize(out)
-
             exe = fluid.Executor(fluid.CPUPlace())
             exe.run(fluid.default_startup_program())
             exe.run(
@@ -1865,22 +1753,14 @@ class AdamOptimizer(Optimizer):
     the 1st moment estimates and the 2nd moment estimates of the gradient.
     
     The parameter ``param_out`` update rule with gradient ``grad``:
-
     .. math::
-
         t & = t + 1
-
         moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
-
         moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
-
         learning\_rate & = learning\_rate * \\
                           \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
-
         param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
-
     Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
-
     Args:
         learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
             It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
@@ -1914,13 +1794,10 @@ class AdamOptimizer(Optimizer):
             gradient in current mini-batch, so it will be much faster. But this mode has
             different semantics from the original Adam algorithm and may lead to different results.
             The default value is False.
-
     Examples:
         .. code-block:: python
-
             import paddle
             import paddle.fluid as fluid
-
             place = fluid.CPUPlace()
             main = fluid.Program()
             with fluid.program_guard(main):
@@ -1929,10 +1806,8 @@ class AdamOptimizer(Optimizer):
                 y_predict = fluid.layers.fc(input=x, size=1, act=None)
                 cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                 avg_cost = fluid.layers.mean(cost)
-
                 adam_optimizer = fluid.optimizer.AdamOptimizer(0.01)
                 adam_optimizer.minimize(avg_cost)
-
                 fetch_list = [avg_cost]
                 train_reader = paddle.batch(
                     paddle.dataset.uci_housing.train(), batch_size=1)
@@ -1941,14 +1816,11 @@ class AdamOptimizer(Optimizer):
                 exe.run(fluid.default_startup_program())
                 for data in train_reader():
                     exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
         .. code-block:: python
-
             # Adam with beta1/beta2 as Variable
             import paddle
             import paddle.fluid as fluid
             import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
-
             place = fluid.CPUPlace()
             main = fluid.Program()
             with fluid.program_guard(main):
@@ -1957,11 +1829,9 @@ class AdamOptimizer(Optimizer):
                 y_predict = fluid.layers.fc(input=x, size=1, act=None)
                 cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                 avg_cost = fluid.layers.mean(cost)
-
                 # define beta decay variable
                 def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate):
                     global_step = lr_scheduler._decay_step_counter()
-
                     beta1 = fluid.layers.create_global_var(
                         shape=[1],
                         value=float(beta1_init),
@@ -1976,22 +1846,18 @@ class AdamOptimizer(Optimizer):
                         # set persistable for save checkpoints and resume
                         persistable=True,
                         name="beta2")
-
                     div_res = global_step / decay_steps
                     decayed_beta1 = beta1_init * (decay_rate**div_res)
                     decayed_beta2 = beta2_init * (decay_rate**div_res)
                     fluid.layers.assign(decayed_beta1, beta1)
                     fluid.layers.assign(decayed_beta2, beta2)
-
                     return beta1, beta2
-
                 beta1, beta2 = get_decayed_betas(0.9, 0.99, 1e5, 0.9)
                 adam_optimizer = fluid.optimizer.AdamOptimizer(
                                                     learning_rate=0.01,
                                                     beta1=beta1,
                                                     beta2=beta2)
                 adam_optimizer.minimize(avg_cost)
-
                 fetch_list = [avg_cost]
                 train_reader = paddle.batch(
                     paddle.dataset.uci_housing.train(), batch_size=1)
@@ -2129,26 +1995,16 @@ class AdamaxOptimizer(Optimizer):
     in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_.
     The Adamax algorithm is a variant of the Adam algorithm based on the infinity norm,
     which makes the learning rate update algorithm more stable and simple.
-
     The parameter ``param_out`` update rule with gradient ``grad``:
-
     .. math::
-
         t & = t + 1
-
         moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
-
         inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
-
         learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
-
         param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
-
     Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
-
     The original paper does not have an ``epsilon`` attribute,
     it is added here for numerical stability to prevent the division by 0 error.
-
     Args:
         learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
             It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
@@ -2173,20 +2029,15 @@ class AdamaxOptimizer(Optimizer):
         name (str, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name`.
             The default value is None.
-
     **Notes**:
         **Currently, AdamaxOptimizer doesn't support sparse parameter optimization.**
-
     Examples:
         .. code-block:: python
-
           import paddle.fluid as fluid
           import numpy
-
           # First create the Executor.
           place = fluid.CPUPlace() # fluid.CUDAPlace(0)
           exe = fluid.Executor(place)
-
           train_program = fluid.Program()
           startup_program = fluid.Program()
           with fluid.program_guard(train_program, startup_program):
@@ -2195,10 +2046,8 @@ class AdamaxOptimizer(Optimizer):
               loss = fluid.layers.mean(hidden)
               adam = fluid.optimizer.AdamaxOptimizer(learning_rate=0.2)
               adam.minimize(loss)
-
           # Run the startup program once and only once.
           exe.run(startup_program)
-
           x = numpy.random.random(size=(10, 1)).astype('float32')
           outs = exe.run(program=train_program,
                         feed={'X': x},
@@ -2299,17 +2148,13 @@ class DpsgdOptimizer(Optimizer):
     r"""
     We implement the Dpsgd optimizer according to CCS16 paper -
     Deep Learning with Differential Privacy.
-
     Examples:
         .. code-block:: python
-
           import paddle.fluid as fluid
           import numpy
-
           # First create the Executor.
           place = fluid.CPUPlace() # fluid.CUDAPlace(0)
           exe = fluid.Executor(place)
-
           train_program = fluid.Program()
           startup_program = fluid.Program()
           with fluid.program_guard(train_program, startup_program):
@@ -2318,15 +2163,12 @@ class DpsgdOptimizer(Optimizer):
               loss = fluid.layers.mean(hidden)
               optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
               optimizer.minimize(loss)
-
           # Run the startup program once and only once.
           exe.run(startup_program)
-
           x = numpy.random.random(size=(10, 1)).astype('float32')
           outs = exe.run(program=train_program,
                         feed={'X': x},
                          fetch_list=[loss.name])
-
     Args:
         learning_rate (float|Variable): the learning rate used to update parameters. \
         Can be a float value or a Variable with one float value as data element.
@@ -2395,21 +2237,14 @@ class DecayedAdagradOptimizer(Optimizer):
     The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces
     the decay rate to solve the problem of a sharp drop in the learning rate
     during model training when using the AdagradOptimizer.
-
     The parameter ``param_out`` update rule with gradient ``grad``:
-
     .. math::
-
         moment\_out & = decay * moment + (1 - decay) * grad * grad
-
         param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
-
     Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic
     Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
-
     The original paper does not have an ``epsilon`` attribute. It is added here for numerical
     stability to avoid the division by zero error.
-
     Args:
         learning_rate (float|Variable): The learning rate used to update ``Parameter``.
             It can be a float value or a ``Variable`` with a float type.
@@ -2431,15 +2266,11 @@ class DecayedAdagradOptimizer(Optimizer):
         name (str, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name`.
             The default value is None.
-
     **Notes**:
         **Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.**
-
     Examples:
         .. code-block:: python
-
             import paddle.fluid as fluid
-
             x = fluid.data( name='x', shape=[None, 10], dtype='float32' )
             trans = fluid.layers.fc( x, 100 )
             cost = fluid.layers.reduce_mean( trans )
@@ -2503,20 +2334,13 @@ class DecayedAdagradOptimizer(Optimizer):
 class AdadeltaOptimizer(Optimizer):
     r"""
     **Notes: This API does not support sparse parameter optimization.**
-
     Adadelta Optimizer. Please refer to this for details:
     `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.
-
     The update is done as follows:
-
     .. math::
-
         E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
-
         learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
-
         E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
-
     Args:
         learning_rate (float|Variable): global learning rate.
         epsilon (float): a small float number for numeric stability. Default 1.0e-6.
@@ -2536,18 +2360,14 @@ class AdadeltaOptimizer(Optimizer):
         name (str, optional): The default value is None. Normally there is no need for user
                 to set this property. For more information, please refer to
                 :ref:`api_guide_Name` .
-
     Examples:
         .. code-block:: python
-
             import paddle.fluid as fluid
-
             image = fluid.data(name='image', shape=[None, 28], dtype='float32')
             fc = fluid.layers.fc(image, size=10)
             cost = fluid.layers.reduce_mean(fc)
             optimizer = fluid.optimizer.Adadelta(
                 learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
-
             # optimizer_ops is a list of optimizer operators to update parameters
             # params_grads is a list of (param, param_grad), where param is each
             # parameter and param_grad is the gradient variable of param.
@@ -2624,49 +2444,30 @@ class RMSPropOptimizer(Optimizer):
     Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
     rate method. The original slides proposed RMSProp: Slide 29 of
     http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
-
     The original equation is as follows:
-
     ..  math::
-
         r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
-
         w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
-
     The first equation calculates a moving average of the squared gradient for
     each weight. The gradient is then divided by :math:`\\sqrt{r(w,t) + \\epsilon}`.
-
     In some cases, adding a momentum term :math:`\\beta` is beneficial.
     In our implementation, Nesterov momentum is used:
-
     ..  math::
-
         r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
-
         v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
             \\epsilon}} \\nabla Q_{i}(w)
-
         w & = w - v(w, t)
-
     if centered is True:
-
     ..  math::
-
         r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
-
         g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
-
         v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
             \\epsilon}} \\nabla Q_{i}(w)
-
         w & = w - v(w, t)
-
     where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
     and so on. :math:`\\beta` is the momentum term. :math:`\\epsilon` is a
     smoothing term to avoid division by zero, usually set somewhere in range
     from 1e-4 to 1e-8.
-
-
     Parameters:
         learning_rate(float): Global learning rate.
         rho(float): rho is :math:`\\rho` in equation, default is 0.95.
@@ -2692,17 +2493,13 @@ class RMSPropOptimizer(Optimizer):
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information. \
             For details, please refer to :ref:`api_guide_Name`. Default is None.
-
     Raises:
         ValueError: If learning_rate, rho, epsilon, momentum are None.
-
     Examples:
           .. code-block:: python
-
             import paddle
             import paddle.fluid as fluid
             import numpy as np
-
             place = fluid.CPUPlace()
             main = fluid.Program()
             with fluid.program_guard(main):
@@ -2711,10 +2508,8 @@ class RMSPropOptimizer(Optimizer):
                 y_predict = fluid.layers.fc(input=x, size=1, act=None)
                 cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                 avg_cost = fluid.layers.mean(cost)
-
                 rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
                 rms_optimizer.minimize(avg_cost)
-
                 fetch_list = [avg_cost]
                 train_reader = paddle.batch(
                     paddle.dataset.uci_housing.train(), batch_size=1)
@@ -2723,7 +2518,6 @@ class RMSPropOptimizer(Optimizer):
                 exe.run(fluid.default_startup_program())
                 for data in train_reader():
                     exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
     """
 
     _momentum_acc_str = "momentum"
@@ -2810,43 +2604,24 @@ class RMSPropOptimizer(Optimizer):
 class FtrlOptimizer(Optimizer):
     r"""
     FTRL (Follow The Regularized Leader) Optimizer.
-
     The paper that proposed Follow The Regularized Leader (FTRL):
     (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
-
     ..  math::
-
         &new\_accum = squared\_accum + grad^2
-
         &if (lr\_power == -0.5):
-
         &\quad  linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}
-
         &else:
-
         &\quad   linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - squared\_accum^{-lr\_power}}{learning\_rate * param}
-
-
         &x = l1 * sign(linear\_accum) - linear\_accum
-
         &if (lr\_power == -0.5):
-
         &\quad   y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)
-
         &\quad   pre\_shrink = \\frac{x}{y}
-
         &\quad   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
-
         &else:
-
         &\quad   y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)
-
         &\quad   pre\_shrink = \\frac{x}{y}
-
         &\quad   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
-
         &squared\_accum += grad^2
-
     Parameters:
         learning_rate (float|Variable): Global learning rate.
         l1 (float): L1 regularization strength, default is 0.0.
@@ -2866,17 +2641,13 @@ class FtrlOptimizer(Optimizer):
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information. \
             For details, please refer to :ref:`api_guide_Name`. Default is None.
-
     Raises:
         ValueError: If learning_rate, rho, epsilon, momentum are None.
-
     Examples:
           .. code-block:: python
-
             import paddle
             import paddle.fluid as fluid
             import numpy as np
-
             place = fluid.CPUPlace()
             main = fluid.Program()
             with fluid.program_guard(main):
@@ -2885,10 +2656,8 @@ class FtrlOptimizer(Optimizer):
                 y_predict = fluid.layers.fc(input=x, size=1, act=None)
                 cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                 avg_cost = fluid.layers.mean(cost)
-
                 ftrl_optimizer = fluid.optimizer.Ftrl(learning_rate=0.1)
                 ftrl_optimizer.minimize(avg_cost)
-
                 fetch_list = [avg_cost]
                 train_reader = paddle.batch(
                     paddle.dataset.uci_housing.train(), batch_size=1)
@@ -2897,7 +2666,6 @@ class FtrlOptimizer(Optimizer):
                 exe.run(fluid.default_startup_program())
                 for data in train_reader():
                     exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
-
     NOTE:
        Currently, FtrlOptimizer doesn't support sparse parameter optimization.
     """
@@ -2969,32 +2737,20 @@ class FtrlOptimizer(Optimizer):
 class LambOptimizer(AdamOptimizer):
     r"""
     LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.
-
     LAMB Optimizer is designed to scale up the batch size of training without losing 
     accuracy, which supports adaptive element-wise updating and accurate layer-wise 
     correction. For more information, please refer to `Large Batch Optimization for 
     Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .
-
     The updating of parameters follows:
-
     ..  math::
-
         m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t 
-
         v_t &= \\beta_2 v_{t - 1}  + (1 - \\beta_2)g_t^2
-
         m_t &= \\frac{m_t}{\\beta_1^t}
-
         v_t &= \\frac{v_t}{\\beta_2^t}
-
         r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon}
-
         w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1})
-
-
     where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the 
     learning rate, :math:`\\lambda` the LAMB weight decay rate.
-
     Args:
         learning_rate (float|Variable, optional): the learning rate used to update parameters. \
             Can be a float value or a Variable with data type float32. Default 0.001.
@@ -3022,19 +2778,15 @@ class LambOptimizer(AdamOptimizer):
             Default None.
         name(str|None): For detailed information, please refer to 
             :ref:`api_guide_Name` . Usually there is no need to set name; it is None by default.
-
     Examples:
         .. code-block:: python
             
             import paddle.fluid as fluid 
-
             data = fluid.data(name='x', shape=[-1, 5], dtype='float32')
             hidden = fluid.layers.fc(input=data, size=10)
             cost = fluid.layers.mean(hidden)
-
             def exclude_fn(param):
                 return param.name.endswith('.b_0')
-
             optimizer = fluid.optimizer.Lamb(learning_rate=0.002,
                                              exclude_from_weight_decay_fn=exclude_fn)
             optimizer.minimize(cost)
@@ -3157,36 +2909,28 @@ Lamb = LambOptimizer
 class ModelAverage(Optimizer):
     r"""
 	:api_attr: Static Graph
-
     The ModelAverage optimizer accumulates specific continuous historical parameters
     during training. The accumulated historical range can be controlled by the passed
     ``average_window_rate`` argument. The averaged ``Parameter`` are used in the prediction,
     which usually can improve the accuracy of the prediction.
-
     Accumulate the average of the ``Parameter`` in the sliding window, the result will be saved
     in a temporary variable, can be applied to the current model's ``Parameter`` by calling
     the ``apply()`` method, and the current model ``Parameter`` can be restored by calling
     the ``restore()`` method.
-
     The window size for calculating the average is determined by ``average_window_rate``,
     ``min_average_window``, ``max_average_window`` and the current ``Parameter`` update times (num_updates).
-
     When the cumulative times (num_accumulates) is greater than the specific window
     threshold (average_window), the accumulated ``Parameter`` temporary variable is set to 0.0.
     The following example will help to understand the role of these arguments:
-
     ::
-
         if num_accumulates >= min_average_window and num_accumulates >= min(max_average_window, num_updates * average_window_rate):
             num_accumulates = 0
-
     In the above conditional judgment statement, ``num_accumulates`` indicates the current
     accumulated number, which can be abstractly understood as the length of the cumulative window.
     The length of the window must be at least the length set by the ``min_average_window`` argument,
     and cannot exceed the length specified by the ``max_average_window`` argument or
     ``num_updates * average_window_rate``, where ``num_updates`` indicates the current ``Parameter``
     update times, ``average_window_rate`` is a coefficient that calculates the length of the window.
-
     Args:
         average_window_rate (float): The calculate ratio of the window length relative to ``Parameter`` update times.
         min_average_window (int, optional): the minimum size of average window length. The default value is 10000.
@@ -3199,18 +2943,13 @@ class ModelAverage(Optimizer):
         name (str, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name`.
             The default value is None.
-
     Examples:
-
       .. code-block:: python
-
         import paddle.fluid as fluid
         import numpy
-
         # First create the Executor.
         place = fluid.CPUPlace()  # fluid.CUDAPlace(0)
         exe = fluid.Executor(place)
-
         train_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(train_program, startup_program):
@@ -3220,19 +2959,16 @@ class ModelAverage(Optimizer):
             loss = fluid.layers.mean(hidden)
             optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
             optimizer.minimize(loss)
-
             # build ModelAverage optimizer
             model_average = fluid.optimizer.ModelAverage(0.15,
                                                          min_average_window=10000,
                                                          max_average_window=12500)
-
             exe.run(startup_program)
             for i in range(12500):
                 x = numpy.random.random(size=(10, 1)).astype('float32')
                 outs = exe.run(program=train_program,
                                feed={'X': x},
                                fetch_list=[loss.name])
-
             # apply ModelAverage
             with model_average.apply(exe):
                 x = numpy.random.random(size=(10, 1)).astype('float32')
@@ -3356,24 +3092,18 @@ class ModelAverage(Optimizer):
     def apply(self, executor, need_restore=True):
         """
         Apply the average of the cumulative ``Parameter`` to the parameters of the current model.
-
         Args:
             executor(fluid.Executor): The current network executor.
             need_restore(bool): Restore flag variable, if set to True, the network will restore
                 the parameters of the network to the default value, if set to False,
                 it will not be restored. The default value is True.
-
         Examples:
-
           .. code-block:: python
-
             import paddle.fluid as fluid
             import numpy
-
             # First create the Executor.
             place = fluid.CPUPlace()  # fluid.CUDAPlace(0)
             exe = fluid.Executor(place)
-
             train_program = fluid.Program()
             startup_program = fluid.Program()
             with fluid.program_guard(train_program, startup_program):
@@ -3383,19 +3113,16 @@ class ModelAverage(Optimizer):
                 loss = fluid.layers.mean(hidden)
                 optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
                 optimizer.minimize(loss)
-
                 # build ModelAverage optimizer
                 model_average = fluid.optimizer.ModelAverage(0.15,
                                                             min_average_window=10000,
                                                             max_average_window=12500)
-
                 exe.run(startup_program)
                 for i in range(12500):
                     x = numpy.random.random(size=(10, 1)).astype('float32')
                     outs = exe.run(program=train_program,
                                 feed={'X': x},
                                 fetch_list=[loss.name])
-
                 # apply ModelAverage
                 with model_average.apply(exe):
                     x = numpy.random.random(size=(10, 1)).astype('float32')
@@ -3416,18 +3143,13 @@ class ModelAverage(Optimizer):
         
         Args:
             executor(fluid.Executor): The current network executor.
-
         Examples:
-
           .. code-block:: python
-
             import paddle.fluid as fluid
             import numpy
-
             # First create the Executor.
             place = fluid.CPUPlace()  # fluid.CUDAPlace(0)
             exe = fluid.Executor(place)
-
             train_program = fluid.Program()
             startup_program = fluid.Program()
             with fluid.program_guard(train_program, startup_program):
@@ -3437,26 +3159,22 @@ class ModelAverage(Optimizer):
                 loss = fluid.layers.mean(hidden)
                 optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
                 optimizer.minimize(loss)
-
                 # build ModelAverage optimizer
                 model_average = fluid.optimizer.ModelAverage(0.15,
                                                             min_average_window=10000,
                                                             max_average_window=12500)
-
                 exe.run(startup_program)
                 for i in range(12500):
                     x = numpy.random.random(size=(10, 1)).astype('float32')
                     outs = exe.run(program=train_program,
                                 feed={'X': x},
                                 fetch_list=[loss.name])
-
                 # apply ModelAverage
                 with model_average.apply(exe, False):
                     x = numpy.random.random(size=(10, 1)).astype('float32')
                     exe.run(program=train_program,
                             feed={'X': x},
                             fetch_list=[loss.name])
-
                 # restore Parameters
                 model_average.restore(exe)
         """
@@ -3466,31 +3184,23 @@ class ModelAverage(Optimizer):
 class ExponentialMovingAverage(object):
     r"""
 	:api_attr: Static Graph
-
     Compute the moving average of parameters with exponential decay.
     Given a parameter :math:`\\theta`, its exponential moving average (EMA)
     will be
-
     ..  math::
-
         \\text{EMA}_0 & = 0
-
 	\\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t
-
     The average results calculated by **update()** method will be saved in 
     temporary variables which are created and maintained by the object, and can 
     be applied to parameters of current model by calling **apply()** method. And 
     the **restore()** method is used to restore the parameters.
-
     **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be 
     zero biased, which can be corrected by divided by a factor 
     :math:`(1 - \\text{decay}^t)` , i.e., the actual EMAs applied to parameters 
     when calling **apply()** method would be 
-
     ..  math::
     
         \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t}
-
     **Decay rate scheduling**. A large decay rate very close to 1 would result 
     in that the averages move very slowly. And a better strategy is to set a 
     relative smaller decay rate in the very beginning. The argument **thres_steps**
@@ -3500,10 +3210,7 @@ class ExponentialMovingAverage(object):
     ..  math::
     
         \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}})
-
     Usually **thres_steps** can be the global training steps.
-
-
     Args:
 	decay (float, optional): The exponential decay rate, usually close to 1, such as 
             0.999, 0.9999, ... . Default 0.999.
@@ -3512,40 +3219,29 @@ class ExponentialMovingAverage(object):
         name (str|None): For detailed information, please refer to 
             :ref:`api_guide_Name`. Usually name is no need to set and None by 
             default.
-
-
     Examples:
-
 	.. code-block:: python
-
 	    import numpy
 	    import paddle
 	    import paddle.fluid as fluid
-
 	    data = fluid.data(name='x', shape=[-1, 5], dtype='float32')
 	    hidden = fluid.layers.fc(input=data, size=10)
 	    cost = fluid.layers.mean(hidden)
-
 	    test_program = fluid.default_main_program().clone(for_test=True)
-
 	    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 	    optimizer.minimize(cost)
-
 	    global_steps = fluid.layers.autoincreased_step_counter()
 	    ema = fluid.optimizer.ExponentialMovingAverage(0.999, thres_steps=global_steps)
 	    ema.update()
-
 	    place = fluid.CPUPlace()
 	    exe = fluid.Executor(place)
 	    exe.run(fluid.default_startup_program())
-
 	    for pass_id in range(3):
 		for batch_id in range(6):
 		    data = numpy.random.random(size=(10, 5)).astype('float32')
 		    exe.run(program=fluid.default_main_program(),
 			feed={'x': data}, 
 			fetch_list=[cost.name])
-
 		# usage 1
 		with ema.apply(exe):
 		    data = numpy.random.random(size=(10, 5)).astype('float32')
@@ -3553,7 +3249,6 @@ class ExponentialMovingAverage(object):
 			    feed={'x': data}, 
 			    fetch_list=[hidden.name])
 			    
-
 		 # usage 2
 		with ema.apply(exe, need_restore=False):
 		    data = numpy.random.random(size=(10, 5)).astype('float32')
@@ -3718,12 +3413,10 @@ class ExponentialMovingAverage(object):
 class PipelineOptimizer(object):
     """
 	:api_attr: Static Graph
-
     Pipeline Optimizer: Make a program to run as pipeline, that is splitting a
     program into multiple sections (sub-programs) and each section run on a
     device to enable the training of large scale models and the use of
     heterogeneous devices. Meanwhile, all sections run in the stype of pipeline.
-
     Args:
         optimizer (Optimizer): The optimizer to use, such as SGD.
         num_microbatches (int): Number of microbatches. [Optional. Default:1].
@@ -3731,10 +3424,8 @@ class PipelineOptimizer(object):
     
     Examples:
         .. code-block:: python
-
             import paddle.fluid as fluid
             import paddle.fluid.layers as layers
-
             with fluid.device_guard("gpu:0"):
                 x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
                 y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
@@ -3743,10 +3434,8 @@ class PipelineOptimizer(object):
                     capacity=64,
                     use_double_buffer=True,
                     iterable=False)
-
                 emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
                 emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
-
             with fluid.device_guard("gpu:1"):
                 concat = layers.concat([emb_x, emb_y], axis=1)
                 fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
@@ -3754,14 +3443,12 @@ class PipelineOptimizer(object):
             optimizer = fluid.optimizer.SGD(learning_rate=0.5)
             optimizer = fluid.optimizer.PipelineOptimizer(optimizer)
             optimizer.minimize(loss)
-
             def train_reader():
                 for _ in range(4):
                     x = np.random.random(size=[1]).astype('int64')
                     y = np.random.random(size=[1]).astype('int64')
                     yield x, y
             data_loader.set_sample_generator(train_reader, batch_size=1)
-
             place = fluid.CUDAPlace(0)
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -3946,7 +3633,6 @@ class PipelineOptimizer(object):
         """
         Split a program into sections according to devices that ops run on.
         The op whose op_device attr is "gpu:all" is copied to all sections.
-
         Args:
             main_program (Program): the main program
             devices: all used devices
@@ -4023,7 +3709,6 @@ class PipelineOptimizer(object):
     def _find_post_op(self, ops, cur_op, var_name):
         """
         Find the real post op that has variable named var_name as input.
-
         Args:
             ops (list): A list of ops.
             cur_op (Operator): Current operator which has variable named
@@ -4054,7 +3739,6 @@ class PipelineOptimizer(object):
     def _find_real_prev_op(self, ops, cur_op, var_name):
         """
         Find the real previous op that outputs variable named var_name.
-
         Args:
             ops (list): A list of ops.
             cur_op (Operator): Current operator which has variable named
@@ -4786,31 +4470,24 @@ class PipelineOptimizer(object):
 class RecomputeOptimizer(Optimizer):
     """
 	:api_attr: Static Graph
-
     Recompute Optimizer Wrapper
-
     Normally, a training step contains three sub-steps: first, run forward
     Operators to calculate the loss; second, run backward Operators to 
     calculate gradient of the parameters; third, apply optimization method
     to update the value of the parameters.
-
     In the forward computation process, all variables that are needed by 
     backward computation process will be kept in memory, which occupy a great
     amount of memory when the network becomes very deep.
-
     Recompute split the network to k segments. In each segment, It will 
     recompute the forward Operators, before running backward operators. It is
     very helpful for saving memory.
  
     The Variables that separate a network to segments are called as checkpoints,
     and users should set it manually. The usage is very simple:
-
     Args:
         optimizer (Optimizer): The optimizer that is applied to parameters.
-
     Examples:
         .. code-block:: python
-
             import paddle.fluid as fluid
             import numpy as np
             def gen_data():
@@ -4826,24 +4503,20 @@ class RecomputeOptimizer(Optimizer):
             input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
             input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
             cost, fc_1, pred = mlp(input_x, input_y)
-
             sgd = fluid.optimizer.Adam(learning_rate=0.01)
             sgd = fluid.optimizer.RecomputeOptimizer(sgd)
             sgd._set_checkpoints([fc_1, pred])
             sgd.minimize(cost)
-
             print("Finished optimize")
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
             step = 10
-
             for i in range(step):
                 cost_val = exe.run(feed=gen_data(),
                        program=fluid.default_main_program(),
                        fetch_list=[cost.name])
                 print("step=%d cost=%f" % (i, cost_val[0]))
-
     """
 
     def __init__(self, optimizer):
@@ -4877,16 +4550,12 @@ class RecomputeOptimizer(Optimizer):
     def load(self, state_dict):
         """
 	    :api_attr: Static Graph
-
         load function is not supported by Recompute Optimizer for now.
         :return: None
-
         Args:
             state_dict: the dict load by load_persistable method
-
         Examples:
             .. code-block:: python
-
                 import paddle.fluid as fluid
                 import paddle.compat as cpt
                 
@@ -4917,32 +4586,24 @@ class RecomputeOptimizer(Optimizer):
     def apply_gradients(self, params_grads):
         """
         call apply_gradients function of self._optimizer.
-
         Args:
             params_grads (list): list of (param, grad) pair to do optimization.
-
         Returns:
             list: A list of operators appended to the current program.
-
         Examples:
             .. code-block:: python
-
                 import paddle.fluid as fluid
                 import paddle.fluid.framework as framework
-
                 def mlp(input_x, input_y, hid_dim=128, label_dim=2):
                     fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
                     prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
                     cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
                     sum_cost = fluid.layers.reduce_mean(cost)
                     return sum_cost, fc_1, prediction
-
-
                 input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
                 input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
                 cost, fc_1, pred = mlp(input_x, input_y)
                 print("Finished FF")
-
                 sgd = fluid.optimizer.Adam(learning_rate=0.01)
                 sgd = fluid.optimizer.RecomputeOptimizer(sgd)
                 sgd._set_checkpoints([fc_1, pred])
@@ -4951,11 +4612,9 @@ class RecomputeOptimizer(Optimizer):
                     startup_program=None,
                     parameter_list=None,
                     no_grad_set=None)
-
                 program = cost.block.program
                 with framework.program_guard(program, None):
                     optimize_ops = sgd.apply_gradients(params_grads)
-
                 print("Finished apply gradients")
         """
 
@@ -4984,7 +4643,6 @@ class RecomputeOptimizer(Optimizer):
     def _append_fill_constant_ops(self, startup_program):
         """
         add fill_constant_ops to the end of the prog
-
         we should fill the pinned vars before runing the main_prog
         to instantiate their tensor hold_, which could tell us whether 
         the host memory could hold all the checkpoints from all the 
@@ -5323,7 +4981,6 @@ class RecomputeOptimizer(Optimizer):
                  callbacks=None):
         """
         call append_backward with checkpoints.
-
         Args:
             loss (Variable): loss variable to run optimizations.
             startup_program (Program): startup_program for initializing parameters
@@ -5333,10 +4990,8 @@ class RecomputeOptimizer(Optimizer):
             callbacks (list|None): list of callables to run when appending backward
                 operator for one parameter.
             checkpoints (list): list of Variables as checkpoints
-
         Examples:
             .. code-block:: python
-
                 import paddle.fluid as fluid
     
                 def mlp(input_x, input_y, hid_dim=128, label_dim=2):
@@ -5467,34 +5122,27 @@ class RecomputeOptimizer(Optimizer):
 class LookaheadOptimizer(object):
     r"""
 	:api_attr: Static Graph
-
     This implements the Lookahead optimizer of the
     paper : https://arxiv.org/abs/1907.08610.
-
     Lookahead keeps two sets of params: the fast_params and
     the slow_params. inner_optimizer update fast_params every 
     training step. Lookahead updates the slow_params and fast_params 
     every k training steps as follows:
-
     .. math::
         
         slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1})
 	
 	fast\_param_t &=  slow\_param_t
-
     Args:
         inner_optimizer (Optimizer): The optimizer that update fast params step by step. 
         alpha (float): The learning rate of Lookahead.
         k (int): The slow params is updated every k steps.
-
     Examples:
         .. code-block:: python
-
             import paddle
             import paddle.fluid as fluid
             import numpy as np
             import numpy.random as random
-
             paddle.enable_static()
         
             x = fluid.layers.data(name='x', shape=[2], dtype='float32')
@@ -5511,7 +5159,6 @@ class LookaheadOptimizer(object):
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
-
             def train_reader(limit=5):
                 for i in range(limit):
                     yield random.random([2]).astype('float32'), random.random([1]).astype('int64')
@@ -5522,7 +5169,6 @@ class LookaheadOptimizer(object):
             for batch_data in reader():
                 exe.run(fluid.default_main_program(),
                 feed=feeder.feed(batch_data))
-
     """
 
     def __init__(self, inner_optimizer, alpha=0.5, k=5):
@@ -5641,49 +5287,39 @@ class GradientMergeOptimizer(object):
     Gradient Merge, also called as Gradient Accumulation,
     is a training strategy for larger batches. With this strategy,
     the parameter will not be updated until specific steps.
-
     For each step, the forward network and the backward network
     will run to calculate the gradient of the parameters.
-
     For every k step, the optimization network will run,
     applying a specific optimization method (such as SGD, Adam)
     to the parameters.
-
     Args:
         inner_optimizer (Optimizer): The specific optimization (such as SGD, Adam)
             which update the parameters
         k_steps (int): the update period of the parameters
         avg (bool): whether to average the gradients of each mini-batch,
             the default value is `True`
-
     Examples:
         .. code-block:: python
-
         import paddle.fluid as fluid
         import numpy as np
-
         def gen_data(batch_size):
             return {"x": np.random.random(size=(batch_size, 32)).astype('float32'),
                     "y": np.random.random(size=(batch_size, 1)).astype('int64')}
-
         def mlp(input_x, input_y, hid_dim=128, label_dim=2):
             fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
             prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
             cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
             sum_cost = fluid.layers.reduce_mean(cost)
             return sum_cost, fc_1, prediction
-
         input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
         input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
         cost, fc_1, pred = mlp(input_x, input_y)
         sgd = fluid.optimizer.Adam(learning_rate=0.01)
         sgd = fluid.optimizer.GradientMergeOptimizer(sgd, k_steps=4, avg=True)
         sgd.minimize(cost)
-
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-
         for i in range(10):
             cost_val = exe.run(feed=gen_data(32),
                        program=fluid.default_main_program(),
@@ -5962,4 +5598,4 @@ class GradientMergeOptimizer(object):
         optimize_ops = self.apply_optimize(
             loss, startup_program=startup_program, params_grads=params_grads)
 
-        return optimize_ops, params_grads
+        return optimize_ops, params_grads
\ No newline at end of file
diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
index ceaf4bbdfebe6c2396a93c51b369beb9b2cdbef4..acfa4c93177d4dcf2f6865c6748e1a5541931244 100644
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -21,4 +21,4 @@ py_test(test_custom_conj SRCS test_custom_conj.py)
 # other tests
 py_test(test_sysconfig SRCS test_sysconfig.py)
 py_test(test_check_abi SRCS test_check_abi.py)
-cc_test(test_check_error SRCS test_check_error.cc DEPS gtest)
+cc_test(test_check_error SRCS test_check_error.cc DEPS gtest)
\ No newline at end of file
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
index 8c3a66f933f59ddb01a624c57c3b1573e71c953e..4a4ea10d0711bad084afc22146f1c17bd3a86389 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
@@ -156,4 +156,4 @@ class TestDistMnist2x2(TestDistRunnerBase):
 
 
 if __name__ == "__main__":
-    runtime_main(TestDistMnist2x2)
+    runtime_main(TestDistMnist2x2)
\ No newline at end of file
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index bcf80fa4771d364c00bb9adff9a68d938b724aba..ea183e9444878d032bd2ff63cc6fe5b085e9f9a5 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -1971,9 +1971,9 @@ class TestPow_factor_tensor(TestActivation):
             feed={"x": input},
             fetch_list=[out_1, out_2, res, out_6])
 
-        assert np.array_equal(res_1, np.power(input, 2))
-        assert np.array_equal(res_2, np.power(input, 3))
-        assert np.array_equal(res_6, np.power(input, 3))
+        assert np.allclose(res_1, np.power(input, 2))
+        assert np.allclose(res_2, np.power(input, 3))
+        assert np.allclose(res_6, np.power(input, 3))
 
     def test_error(self):
         in1 = fluid.layers.data(
diff --git a/python/setup.py.in b/python/setup.py.in
index 73c773bab494d035dacc9d122f67d365700c160e..45fbf16f264f9ec1ec1cd9e46ed7b3e136a5d32a 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -568,4 +568,4 @@ with redirect_stdout():
 # we don't print them on the screen, and you can open `setup.py.log`
 # for the full logs.
 if os.path.exists('${SETUP_LOG_FILE}'):
-    os.system('grep -v "purelib" ${SETUP_LOG_FILE}')
+    os.system('grep -v "purelib" ${SETUP_LOG_FILE}')
\ No newline at end of file