From 491af67732bd6d1959e9f386b0ec39e6cdfb1fda Mon Sep 17 00:00:00 2001 From: PaddlePaddle-Gardener Date: Fri, 14 Jan 2022 14:24:17 +0800 Subject: [PATCH] mirgate_38878 --- .../eager/accumulation/accumulation_node.cc | 1 - .../accumulation/gradient_accumulation.cc | 337 ++++++++++++++ .../eager_generated/backwards/scale_node.cc | 172 +++++++ .../eager_generated/forwards/scale.cc | 99 ++++ paddle/fluid/eager/eager_tensor.h | 1 - paddle/fluid/eager/grad_node_info.h | 1 - .../eager/legacy/infer_var_type_context.h | 260 +++++++++++ paddle/fluid/eager/legacy/prepared_operator.h | 82 ++++ paddle/fluid/eager/legacy/tensor_helper.h | 33 ++ .../framework/data_device_transform_test.cu | 11 +- paddle/fluid/framework/operator.h | 3 +- paddle/fluid/imperative/layer.h | 21 +- paddle/fluid/imperative/op_base.h | 19 + paddle/fluid/imperative/prepared_operator.h | 25 +- paddle/fluid/operators/cast_op.h | 1 - paddle/fluid/operators/conj_op.h | 3 +- paddle/fluid/operators/dot_op.h | 1 - .../elementwise/elementwise_add_op.h | 1 - .../elementwise/elementwise_mul_op.h | 1 - .../elementwise/elementwise_op_function.h | 1 - .../elementwise/elementwise_op_impl.cu.h | 1 - .../elementwise/elementwise_sub_op.h | 1 - paddle/fluid/operators/fill_any_like_op.h | 1 - paddle/fluid/operators/flatten_op.h | 1 - paddle/fluid/operators/matmul_v2_op.h | 1 - paddle/fluid/operators/reduce_ops/reduce_op.h | 2 - paddle/fluid/operators/reshape_op.cc | 1 - paddle/fluid/operators/scale_op.h | 31 +- paddle/fluid/operators/sign_op.h | 1 - paddle/fluid/pybind/eager.cc | 1 - paddle/fluid/pybind/eager_functions.cc | 1 - paddle/fluid/pybind/eager_method.cc | 1 - paddle/fluid/pybind/eager_properties.cc | 1 - paddle/fluid/pybind/eager_utils.cc | 1 - paddle/pten/CMakeLists.txt | 2 +- paddle/pten/all.cc | 0 paddle/pten/all.h | 20 - paddle/pten/api/lib/utils.cc | 81 ++++ paddle/pten/include/core.h | 0 paddle/pten/include/infermeta.h | 21 - paddle/pten/include/math.h | 39 -- paddle/pten/kernels/complex_kernel.h | 3 - paddle/pten/kernels/cpu/scale_kernel.cc | 65 +++ paddle/pten/kernels/flatten_kernel.h | 2 +- paddle/pten/kernels/gpu/scale_kernel.cu | 14 +- .../kernels/impl/matmul_grad_kernel_impl.h | 3 +- paddle/pten/kernels/impl/scale_kernel_impl.h | 50 -- paddle/pten/kernels/math_kernel.h | 3 +- paddle/pten/kernels/reshape_kernel.h | 2 +- paddle/pten/kernels/scale_kernel.h | 44 ++ paddle/pten/kernels/sign_kernel.h | 2 +- paddle/pten/tests/api/scale_api.h | 279 +++++++++++ .../pten/tests/kernels/test_scale_dev_api.cc | 116 +++++ python/paddle/utils/code_gen/api_gen.py | 435 ++++++++++++++++++ 54 files changed, 2094 insertions(+), 205 deletions(-) delete mode 100644 paddle/pten/all.cc delete mode 100644 paddle/pten/all.h delete mode 100644 paddle/pten/include/core.h delete mode 100644 paddle/pten/include/infermeta.h delete mode 100644 paddle/pten/include/math.h delete mode 100644 paddle/pten/kernels/impl/scale_kernel_impl.h diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index ed1146eed0..823c0153d7 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -18,7 +18,6 @@ #include "paddle/pten/api/all.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc index 
e69de29bb2..1f66596a0b 100644
--- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc
+++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc
@@ -0,0 +1,337 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/eager/accumulation/gradient_accumulation.h"
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include "paddle/fluid/eager/eager_tensor.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function_impl.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/api/all.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+#ifdef PADDLE_WITH_XPU
+#include "xpu/refactor/math.h"
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#endif
+
+namespace egr {
+template <typename T>
+class TensorAddFunctor : public boost::static_visitor<> {
+ public:
+  TensorAddFunctor(int64_t numel, const T* x, T* y)
+      : numel_(numel), x_(x), y_(y) {}
+
+  void operator()(const paddle::platform::CPUPlace& place) {
+    paddle::platform::CPUDeviceContext* ctx =
+        dynamic_cast<paddle::platform::CPUDeviceContext*>(
+            paddle::platform::DeviceContextPool::Instance().Get(place));
+    auto blas =
+        paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
+                                         T>(*ctx);
+    blas.AXPY(numel_, 1., x_, y_);
+  }
+
+// TODO(jiabin): Support xpu here from gradient_accumulator.cc
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void operator()(const paddle::platform::CUDAPlace& place) {
+    paddle::platform::CUDADeviceContext* ctx =
+        dynamic_cast<paddle::platform::CUDADeviceContext*>(
+            paddle::platform::DeviceContextPool::Instance().Get(place));
+    auto blas =
+        paddle::operators::math::GetBlas<paddle::platform::CUDADeviceContext,
+                                         T>(*ctx);
+    blas.AXPY(numel_, 1., x_, y_);
+  }
+#else
+  void operator()(const paddle::platform::CUDAPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
+  // TODO(jiabin): Support Npu here from gradient_accumulator.cc
+  // there is NO blas in CUDAPinnedPlace
+  void operator()(const paddle::platform::CUDAPinnedPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  void operator()(const paddle::platform::NPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#else
+  void operator()(const paddle::platform::NPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
+#ifdef PADDLE_WITH_XPU
+  void operator()(const paddle::platform::XPUPlace& place) {
+    paddle::platform::XPUDeviceContext* ctx =
+        dynamic_cast<paddle::platform::XPUDeviceContext*>(
+            paddle::platform::DeviceContextPool::Instance().Get(place));
+    xpu::add<T>(ctx->x_context(), x_, y_, y_, static_cast<int>(numel_));
+  }
+#else
+  void operator()(const paddle::platform::XPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
+#ifdef PADDLE_WITH_MLU
+  void operator()(const paddle::platform::MLUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#else
+  void operator()(const paddle::platform::MLUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
+#ifdef PADDLE_WITH_IPU
+  void operator()(const paddle::platform::IPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#else
+  void operator()(const paddle::platform::IPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
+  void operator()(const paddle::platform::NPUPinnedPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+
+ private:
+  int64_t numel_;
+  const T* x_;
+  T* y_;
+};
+
+template <typename DeviceContext, typename T>
+void TensorAddImpl(const std::shared_ptr<pten::DenseTensor>& src,
+                   pten::DenseTensor* dst,
+                   const paddle::platform::Place& place) {
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  paddle::platform::DeviceContext* ctx = pool.Get(place);
+  auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
+  paddle::operators::math::ElementwiseAddTo<DeviceContext, T> func;
+  func(dev_ctx, *(src.get()), dst);
+}
+
+template <typename DeviceContext, typename T>
+void TensorAddImpl(const paddle::framework::Tensor& src,
+                   paddle::framework::Tensor* dst,
+                   const paddle::platform::Place& place) {
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  paddle::platform::DeviceContext* ctx = pool.Get(place);
+  auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
+  paddle::operators::math::ElementwiseAddTo<DeviceContext, T> func;
+  func(dev_ctx, src, dst);
+}
+
+void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
+  // TODO(jiabin): Support other tensor type later
+  std::shared_ptr<pten::DenseTensor> dst_tensor =
+      std::dynamic_pointer_cast<pten::DenseTensor>(dst->impl());
+  std::shared_ptr<pten::DenseTensor> src_tensor =
+      std::dynamic_pointer_cast<pten::DenseTensor>(src.impl());
+
+  auto numel = src_tensor->numel();
+
+  if (numel == 0) {
+    return;
+  }
+
+  PADDLE_ENFORCE_EQ(
+      dst_tensor->numel(), numel,
+      paddle::platform::errors::PreconditionNotMet(
+          "The number of elements of source tensor and destination tensor "
+          "should be equal, but got the number of elements of source tensor "
+          "is %zu and the number of elements of destination tensor is %zu.",
+          numel, dst_tensor->numel()));
+
+  auto data_type = pten::TransToProtoVarType(src_tensor->dtype());
+  auto place = src_tensor->place();
+
+  PADDLE_ENFORCE_EQ(pten::TransToProtoVarType(dst_tensor->dtype()), data_type,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "The data type of source tensor and destination "
+                        "tensor should be equal, otherwise the calculation "
+                        "results will be incorrect."));
+
+#define PADDLE_TENSOR_ADD(cpp_type)                                          \
+  if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
+    TensorAddFunctor<cpp_type> func(numel, src_tensor->data<cpp_type>(),     \
+                                    dst_tensor->mutable_data<cpp_type>());   \
+    boost::apply_visitor(func, place);                                       \
+    return;                                                                  \
+  }
+
+  // TODO(jiabin): Support NPU here
+  PADDLE_TENSOR_ADD(float);
+// NOTE(phlrain): xpu only support float
+#ifndef PADDLE_WITH_XPU
+  PADDLE_TENSOR_ADD(double);
+  // NOTE(chenweihang): only support complex grad tensor accumulated,
+  // support selected rows if needed in the future
+  PADDLE_TENSOR_ADD(paddle::platform::complex<float>);
+  PADDLE_TENSOR_ADD(paddle::platform::complex<double>);
+#endif
+#undef PADDLE_TENSOR_ADD
+
+  if (data_type == paddle::framework::proto::VarType::FP16) {
+    if (paddle::platform::is_gpu_place(place)) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      return TensorAddImpl<paddle::platform::CUDADeviceContext,
+                           paddle::platform::float16>(src_tensor,
+                                                      dst_tensor.get(), place);
+#else
+      PADDLE_THROW(paddle::platform::errors::Unimplemented(
+          "Gradient accumulation of data type (%s) on place (%s) is not "
+          "supported in imperative mode",
+          paddle::framework::DataTypeToString(data_type), place));
+#endif
+    } else if (paddle::platform::is_cpu_place(place)) {
+      return TensorAddImpl<paddle::platform::CPUDeviceContext,
+                           paddle::platform::float16>(src_tensor,
+                                                      dst_tensor.get(), place);
+    }
+  }
+  PADDLE_THROW(paddle::platform::errors::Unimplemented(
+      "Gradient accumulation of data type (%s) on place (%s) is not "
+      "supported in imperative mode",
+      paddle::framework::DataTypeToString(data_type), place));
+}
+
+void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
+  // TODO(jiabin): Support other tensor type later
+  auto* dst_tensor =
+      dst->MutableVar()->GetMutable<paddle::framework::LoDTensor>();
+  auto& src_tensor = src.Var().Get<paddle::framework::LoDTensor>();
+
+  auto numel = src_tensor.numel();
+
+  // FIXME(minqiyang): loss_grad op will pass a zero grad of label
+  // ugly fix for it
+  if (numel == 0) {
+    return;
+  }
+
+  PADDLE_ENFORCE_EQ(
+      dst_tensor->numel(), numel,
+      paddle::platform::errors::PreconditionNotMet(
+          "The number of elements of source tensor and destination tensor "
+          "should be equal, but got the number of elements of source tensor "
+          "is %zu and the number of elements of destination tensor is %zu.",
+          numel, dst_tensor->numel()));
+
+  auto data_type = src_tensor.type();
+  auto place = src_tensor.place();
+
+  PADDLE_ENFORCE_EQ(dst_tensor->type(), data_type,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "The data type of source tensor and destination "
+                        "tensor should be equal, otherwise the calculation "
+                        "results will be incorrect."));
+
+#define PADDLE_TENSOR_ADD(cpp_type)                                          \
+  if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
+    TensorAddFunctor<cpp_type> func(                                         \
+        numel, src_tensor.data<cpp_type>(),                                  \
+        dst_tensor->mutable_data<cpp_type>(place));                          \
+    boost::apply_visitor(func, place);                                       \
+    return;                                                                  \
+  }
+
+  // TODO(jiabin): Support NPU here
+  PADDLE_TENSOR_ADD(float);
+// NOTE(phlrain): xpu only support float
+#ifndef PADDLE_WITH_XPU
+  PADDLE_TENSOR_ADD(double);
+  // NOTE(chenweihang): only support complex grad tensor accumulated,
+  // support selected rows if needed in the future
+  PADDLE_TENSOR_ADD(paddle::platform::complex<float>);
+  PADDLE_TENSOR_ADD(paddle::platform::complex<double>);
+#endif
+#undef PADDLE_TENSOR_ADD
+
+  if (data_type == paddle::framework::proto::VarType::FP16) {
+    if (paddle::platform::is_gpu_place(place)) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      return TensorAddImpl<paddle::platform::CUDADeviceContext,
+                           paddle::platform::float16>(src_tensor, dst_tensor,
+                                                      place);
+#else
+      PADDLE_THROW(paddle::platform::errors::Unimplemented(
+          "Gradient accumulation of data type (%s) on place (%s) is not "
+          "supported in imperative mode",
+          paddle::framework::DataTypeToString(data_type), place));
+#endif
+    } else if (paddle::platform::is_cpu_place(place)) {
+      return TensorAddImpl<paddle::platform::CPUDeviceContext,
+                           paddle::platform::float16>(src_tensor, dst_tensor,
+                                                      place);
+    }
+  }
+  PADDLE_THROW(paddle::platform::errors::Unimplemented(
+      "Gradient accumulation of data type (%s) on place (%s) is not "
+      "supported in imperative mode",
+      paddle::framework::DataTypeToString(data_type), place));
+}
+
+}  // namespace egr
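Note on the dispatch pattern above: TensorAddFunctor is a place visitor — one operator() overload per device place, selected at runtime by boost::apply_visitor, with unsupported places reduced to a uniform throw. A minimal, self-contained sketch of the same pattern in plain C++17 (std::variant/std::visit standing in for the Boost visitor; CpuPlace, CudaPlace and AddVisitor are illustrative names, not Paddle types):

#include <cstdint>
#include <cstdio>
#include <variant>

struct CpuPlace {};
struct CudaPlace {};
using Place = std::variant<CpuPlace, CudaPlace>;

template <typename T>
struct AddVisitor {
  int64_t numel;
  const T* x;
  T* y;
  // Supported place: accumulate y += x (AXPY with alpha == 1).
  void operator()(const CpuPlace&) const {
    for (int64_t i = 0; i < numel; ++i) y[i] += x[i];
  }
  // Unsupported place in this toy build: report instead of computing.
  void operator()(const CudaPlace&) const {
    std::fprintf(stderr, "accumulation on this place is not supported\n");
  }
};

int main() {
  float x[3] = {1.f, 2.f, 3.f};
  float y[3] = {10.f, 20.f, 30.f};
  Place place = CpuPlace{};
  std::visit(AddVisitor<float>{3, x, y}, place);  // y becomes {11, 22, 33}
}

The runtime value of the place variant picks the overload, which is exactly how one TensorAdd entry point fans out to per-device BLAS calls above.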
+ "Only Float64, Float32, Int64, Int32 are supported for now.")); + break; + } + } +} + +void ScaleAPI(const egr::EagerTensor& x, float scale, float bias, + bool bias_after_scale, egr::EagerTensor* out) { + // TODO(jiabin): Support multiple tensor here, Create DenseTensor is not a + // proper way to Demo it + // Run Forward Function + auto dense_tensor = std::dynamic_pointer_cast(x.impl()); + // Init output tensor + auto tensor_meta = pten::DenseTensorMeta( + dense_tensor->dtype(), dense_tensor->dims(), dense_tensor->layout()); + auto place = dense_tensor->place(); + size_t bytes_size = paddle::framework::product(dense_tensor->dims()) * + SizeOf(dense_tensor->dtype()); + auto dense_out = std::make_shared( + pten::make_intrusive( + paddle::memory::Alloc(place, bytes_size)), + std::move(tensor_meta)); + // Handle Device Context + const paddle::platform::Place& expected_kernel_place = + Controller::Instance().GetExpectedPlace(); + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + + if (expected_kernel_place == paddle::platform::CPUPlace()) { + auto* dev_ctx = dynamic_cast( + pool.Get(expected_kernel_place)); + if (!dev_ctx) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Cannot convert device_context to CPUDeviceContext." + "This indicates backend mismatch." + "Pleas double check your expected place")); + } + ScaleDeviceDispatch( + *dense_tensor.get(), *dev_ctx, scale, bias, bias_after_scale, + dense_out.get()); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { + auto* dev_ctx = dynamic_cast( + pool.Get(expected_kernel_place)); + if (!dev_ctx) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Cannot convert device_context to CUDADeviceContext." + "This indicates backend mismatch." + "Pleas double check your expected place")); + } + ScaleDeviceDispatch( + *dense_tensor.get(), *dev_ctx, scale, bias, bias_after_scale, + dense_out.get()); +#endif + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Detected unsupported backend." + "Only CPU and CUDA Backend are supported for now." + "Please double check if your backend falls into the above two " + "categories.")); + } + + out->set_impl(dense_out); +} + +void GradNodeScale::SetTensorWrappers_X( + const std::vector& tensors) { + // Does nothing for scale +} + +void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } + +std::vector> GradNodeScale::operator()( + const std::vector>& grads) { + // 1. Check Output Size + PADDLE_ENFORCE( + ((grads.size() == 1) && (grads[0].size() == 1)), + paddle::platform::errors::Fatal( + "ScaleGradNode takes exactly 1 grad tensor." + "However received: %d", + "This indicates an issue with Eager Dygraph Backward logic", + grads.size())); + std::vector> outs; + // 2. Create needed out parttern + egr::EagerTensor out; + // Apply Gradient Hooks + if (GradientHooksRegistered()) { + // TODO(jiabin): Shall we apply hook slot by slot here or accept + // vector> to apply all hooks? 
diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
index e69de29bb2..642302a411 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
+++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * This file should be automatically generated by the auto code generator.
+ * All ops' C++ autograd logic is defined here; in the Python-C extension API
+ * system we try to avoid any autograd-related code, and move it all to here.
+ *
+ * Currently, we just manually write some forward autograd here, and we will
+ * replace it with the auto code generator later.
+ * **/
+
+#include "paddle/fluid/eager/api/generated/eager_generated/forwards/scale.h"
+#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
+#include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/eager/eager_tensor.h"
+#include "paddle/fluid/eager/utils.h"
+
+#include "paddle/pten/api/all.h"
+
+namespace egr {
+
+egr::EagerTensor scale(const egr::EagerTensor& x, float scale, float bias,
+                       bool bias_after_scale, bool trace_backward) {
+  // 1. Run Forward
+  // 1.1 Create outputs
+  egr::EagerTensor out;
+  // 1.2 Needed by the original op; we assemble ins, outs, attrs here
+
+  // 1.3 Call forward C++ api
+  ScaleAPI(x, scale, bias, bias_after_scale, &out);
+
+  // 2. Build Backward Depends
+  // 2.1 Get AutogradMetas for all ins and outs
+  auto p_autograd_in = EagerUtils::unsafe_autograd_meta(x);
+  // NOTE: Call EagerUtils::multi_autograd_meta when we have vector of outputs
+  auto p_autograd_out = EagerUtils::autograd_meta(&out);
+
+  // 2.2 Add GradNode
+  // 2.2.1 ComputeRequireGrad
+  // TODO(jiabin) : make this function accept different kinds of input
+  // TODO(zhanlve): which one is more efficient:
+  // 1. construct a vector of pointers
+  // 2. call "ComputeRequireGrad" multiple times
+  bool require_any_grad =
+      EagerUtils::ComputeRequireGrad(trace_backward, p_autograd_in);
+  if (require_any_grad) {
+    EagerUtils::PassStopGradient(false /*generate_grad*/, p_autograd_out);
+
+    // 2.2.2 Set OutRankInfo for outputs; this needs to be the same as the
+    // Edges' input_rank_
+    /** Note:
+    // 1. We provide EagerUtils::SetMultiOutRank(vector), since some Operators
+    // have several slot names with duplicate outputs.
+    // 2. We call AutogradMeta's SetOutputRank only when we have a single
+    // output with a single slot name.
+    **/
+    p_autograd_out->SetSingleOutRankWithSlot(0, 0);
+
+    // Init GradNode
+    auto scale_node = std::make_shared<GradNodeScale>(/* fwd_in_slot_num */ 1,
+                                                      /* bwd_in_slot_num */ 1);
+
+    // Pass Attributes to GradNode
+    scale_node->SetAttributes_scale(scale);
+
+    // Set Next Edges
+    scale_node->AddEdges(p_autograd_in, /*slot id*/ 0);
+
+    // Set TensorWrappers
+    scale_node->SetTensorWrappers_X({x});
+
+    // Set Grad out rank the same as fwd input and set stop gradient to bwd
+    scale_node->SetGradOutMeta(*p_autograd_in, /*slot id*/ 0);
+    // Set Grad in rank the same as fwd output and set stop gradient to bwd
+    scale_node->SetGradInMeta(*p_autograd_out, /*slot id*/ 0);
+
+    // Set History for output: set current Grad Node for it
+    EagerUtils::SetHistory(p_autograd_out, scale_node);
+  }
+
+  return out;
+}
+
+}  // namespace egr
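The forward function above does two jobs: compute the result via ScaleAPI, then, only when gradients are required, wire a GradNodeScale into the autograd graph. A toy model of that record-on-forward scheme (ToyTensor and ToyScale are hypothetical scaffolding; the real egr types carry far more state):

#include <cassert>
#include <functional>

struct ToyTensor {
  float value = 0.f;
  float grad = 0.f;
  std::function<void(float)> backward;  // stand-in for a GradNode edge
};

ToyTensor ToyScale(ToyTensor& x, float scale, float bias,
                   bool trace_backward) {
  ToyTensor out;
  out.value = scale * x.value + bias;  // 1. run forward
  if (trace_backward) {                // 2. build backward dependency
    ToyTensor* in = &x;
    out.backward = [in, scale](float grad_out) {
      // GradNodeScale semantics: d(scale * x + bias)/dx == scale,
      // so the node just rescales the incoming gradient.
      in->grad += scale * grad_out;
    };
  }
  return out;
}

int main() {
  ToyTensor x{3.f};
  ToyTensor out = ToyScale(x, 2.f, 1.f, /*trace_backward=*/true);
  assert(out.value == 7.f);
  out.backward(1.f);  // seed a gradient of 1.0, as a loss would
  assert(x.grad == 2.f);
}

The bias drops out of the backward pass entirely, which is why GradNodeScale above calls ScaleAPI with its saved scale_ and a zero bias.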
diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h
index 80faad9080..c58c0b9e66 100644
--- a/paddle/fluid/eager/eager_tensor.h
+++ b/paddle/fluid/eager/eager_tensor.h
@@ -18,7 +18,6 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
 // pten deps
-#include "paddle/pten/all.h"
 #include "paddle/pten/api/all.h"
 #include "paddle/pten/api/lib/api_declare.h"
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h
index f15c50ef75..5cf0b90220 100644
--- a/paddle/fluid/eager/grad_node_info.h
+++ b/paddle/fluid/eager/grad_node_info.h
@@ -16,7 +16,6 @@
 
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/pten/api/all.h"
-#include "paddle/pten/include/core.h"
 
 namespace egr {
 /**
diff --git a/paddle/fluid/eager/legacy/infer_var_type_context.h b/paddle/fluid/eager/legacy/infer_var_type_context.h
index e69de29bb2..9d9cbeb38c 100644
--- a/paddle/fluid/eager/legacy/infer_var_type_context.h
+++ b/paddle/fluid/eager/legacy/infer_var_type_context.h
@@ -0,0 +1,260 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/eager/eager_tensor.h"
+#include "paddle/fluid/eager/legacy/tensor_helper.h"
+#include "paddle/fluid/eager/legacy/type_def.h"
+#include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/framework/var_type_inference.h"
+#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/pten/api/all.h"
+
+namespace egr {
+namespace legacy {
+
+// infer var type context for imperative mode
+class TensorRuntimeInferVarTypeContext
+    : public paddle::framework::InferVarTypeContext {
+ public:
+  TensorRuntimeInferVarTypeContext(
+      const NameTensorMap& inputs, const NameTensorMap& outputs,
+      const paddle::framework::AttributeMap& attrs_map,
+      const paddle::framework::AttributeMap& default_attrs_map)
+      : InferVarTypeContext(nullptr, nullptr),
+        inputs_(inputs),
+        outputs_(outputs),
+        attrs_(attrs_map),
+        default_attrs_(default_attrs_map) {}
+
+  virtual ~TensorRuntimeInferVarTypeContext() {}
+
+  paddle::framework::Attribute GetAttr(
+      const std::string& name) const override {
+    auto it = attrs_.find(name);
+
+    if (it == attrs_.end()) {
+      it = default_attrs_.find(name);
+      if (it == default_attrs_.end()) {
+        PADDLE_THROW(paddle::platform::errors::NotFound(
+            "Cannot find [%s] in attributes.", name));
+      }
+    }
+
+    return it->second;
+  }
+
+  bool HasInput(const std::string& name) const override {
+    auto it = inputs_.find(name);
+    return (it != inputs_.end() && it->second.size() > 0);
+  }
+
+  bool HasOutput(const std::string& name) const override {
+    auto it = outputs_.find(name);
+    return (it != outputs_.end() && it->second.size() > 0);
+  }
+
+  size_t InputSize(const std::string& name) const {
+    return inputs_.at(name).size();
+  }
+
+  const std::string& InputVarName(const std::string& name,
+                                  const int index = 0) const {
+    // TODO(jiabin): Support this usage inputs_.at(name)[index]->Name()
+    auto it = inputs_.find(name);
+    PADDLE_ENFORCE_NE(it, inputs_.end(),
+                      paddle::platform::errors::PreconditionNotMet(
+                          "Cannot find [%s] in Input", name));
+    return inputs_.at(name)[index]->name();
+  }
+
+  bool InputTypeAnyOf(
+      const std::string& name,
+      paddle::framework::proto::VarType::Type type) const override {
+    auto& inputs = inputs_.at(name);
+    return std::any_of(
+        inputs.begin(), inputs.end(),
+        [&type](const std::shared_ptr<egr::EagerTensor>& var) {
+          return paddle::framework::ToVarType(var->Var().Type()) == type;
+        });
+  }
+
+  bool InputTypeAllOf(
+      const std::string& name,
+      paddle::framework::proto::VarType::Type type) const override {
+    auto& inputs = inputs_.at(name);
+    return std::all_of(
+        inputs.begin(), inputs.end(),
+        [&type](const std::shared_ptr<egr::EagerTensor>& var) {
+          return paddle::framework::ToVarType(var->Var().Type()) == type;
+        });
+  }
+
+  void SyncTypeAndDataType(const std::string& input_name,
+                           const std::string& output_name,
+                           int index = 0) override {
+    auto in_tensor = inputs_.at(input_name)[index];
+    auto out_tensor = outputs_.at(output_name)[index];
+    if (in_tensor != out_tensor) {
+      this->SetTensorType(
+          out_tensor, paddle::framework::ToVarType(in_tensor->Var().Type()));
+    }
+  }
+
+  void SetOutputType(const std::string& name,
+                     paddle::framework::proto::VarType::Type type,
+                     int index = 0) override {
+    if (index == paddle::framework::ALL_ELEMENTS) {
+      for (auto& item : outputs_.at(name)) {
+        this->SetTensorType(item, type);
+      }
+    } else {
+      auto& var = outputs_.at(name)[index];
+      this->SetTensorType(var, type);
+    }
+  }
+
+  void SetTensorType(std::shared_ptr<egr::EagerTensor> out,
+                     paddle::framework::proto::VarType::Type type) {
+    switch (type) {
+      case paddle::framework::proto::VarType::LOD_TENSOR: {
+        out->MutableVar()->GetMutable<paddle::framework::LoDTensor>();
+        break;
+      }
+      default: {
+        PADDLE_THROW(paddle::platform::errors::NotFound(
+            "Cannot find var type: %s while running runtime InferVarType",
+            paddle::framework::ToTypeName(type)));
+      }
+    }
+  }
+
+  paddle::framework::proto::VarType::Type GetInputType(
+      const std::string& name, const int& index = 0) const override {
+    return paddle::framework::ToVarType(inputs_.at(name)[index]->Var().Type());
+  }
+
+  paddle::framework::proto::VarType::Type GetOutputType(
+      const std::string& name, const int& index = 0) const override {
+    // TODO(jiabin): Support SelectedRows when we have it.
+    return paddle::framework::proto::VarType::LOD_TENSOR;
+  }
+
+  paddle::framework::proto::VarType::Type GetInputDataType(
+      const std::string& name, const int& index = 0) const override {
+    return inputs_.at(name)[index]
+        ->Var()
+        .Get<paddle::framework::LoDTensor>()
+        .type();
+  }
+
+  void SetOutputDataType(const std::string& name,
+                         paddle::framework::proto::VarType::Type type,
+                         int index = 0) override {
+    // TODO(jiabin): Setting data_type doesn't seem to make sense in EagerMode.
+  }
+
+  bool IsDygraph() const override { return true; }
+
+ protected:
+  bool HasVar(const std::string& name) const override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "HasVar is not supported in runtime InferVarType"));
+  }
+
+  const std::vector<std::string>& InputVars(
+      const std::string& name) const override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "InputVars is not supported in runtime InferVarType"));
+  }
+
+  const std::vector<std::string>& OutputVars(
+      const std::string& name) const override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "OutputVars is not supported in runtime InferVarType"));
+  }
+
+  paddle::framework::proto::VarType::Type GetVarType(
+      const std::string& name) const override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Do not manipulate var in runtime InferVarType"));
+  }
+
+  void SetVarType(const std::string& name,
+                  paddle::framework::proto::VarType::Type type) override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Do not manipulate var in runtime InferVarType"));
+  }
+
+  paddle::framework::proto::VarType::Type GetVarDataType(
+      const std::string& name) const override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Do not manipulate var in runtime InferVarType"));
+  }
+
+  void SetVarDataType(const std::string& name,
+                      paddle::framework::proto::VarType::Type type) override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Do not manipulate var in runtime InferVarType"));
+  }
+
+  std::vector<paddle::framework::proto::VarType::Type> GetVarDataTypes(
+      const std::string& name) const override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "GetVarDataTypes is not supported in runtime InferVarType"));
+  }
+
+  void SetVarDataTypes(
+      const std::string& name,
+      const std::vector<paddle::framework::proto::VarType::Type>&
+          multiple_data_type) override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "SetVarDataTypes is not supported in runtime InferVarType"));
+  }
+
+  std::vector<int64_t> GetVarShape(const std::string& name) const override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Do not handle Shape in runtime InferVarType"));
+  }
+
+  void SetVarShape(const std::string& name,
+                   const std::vector<int64_t>& dims) override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Do not handle Shape in runtime InferVarType"));
+  }
+
+  int32_t GetVarLoDLevel(const std::string& name) const override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Do not handle LoDLevel in runtime InferVarType"));
+  }
+
+  void SetVarLoDLevel(const std::string& name, int32_t lod_level) override {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Do not handle LoDLevel in runtime InferVarType"));
+  }
+
+ private:
+  const NameTensorMap& inputs_;
+  const NameTensorMap& outputs_;
+  const paddle::framework::AttributeMap& attrs_;
+  const paddle::framework::AttributeMap& default_attrs_;
+};
+
+}  // namespace legacy
+}  // namespace egr
diff --git a/paddle/fluid/eager/legacy/prepared_operator.h b/paddle/fluid/eager/legacy/prepared_operator.h
index e69de29bb2..0e00b52e04 100644
--- a/paddle/fluid/eager/legacy/prepared_operator.h
+++ b/paddle/fluid/eager/legacy/prepared_operator.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/eager/legacy/execution_context.h"
+#include "paddle/fluid/eager/legacy/type_def.h"
+#include "paddle/fluid/framework/data_transform.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/type_defs.h"
+
+DECLARE_bool(use_mkldnn);
+
+namespace paddle {
+namespace framework {
+class Tensor;
+class Variable;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
+namespace egr {
+namespace legacy {
+
+const paddle::framework::Tensor* GetTensorFromVar(
+    const paddle::framework::Variable& var);
+
+std::shared_ptr<NameTensorMap> PrepareData(
+    const paddle::framework::OperatorWithKernel& op, const NameTensorMap& ins,
+    const paddle::framework::OpKernelType& expected_kernel_key);
+
+class PreparedOp {
+ public:
+  PreparedOp(const paddle::framework::OperatorBase& op,
+             const paddle::framework::RuntimeContext& ctx,
+             const paddle::framework::OpKernelType& kernel_type,
+             const paddle::framework::OperatorWithKernel::OpKernelFunc& func,
+             paddle::platform::DeviceContext* dev_ctx);
+
+  static PreparedOp Prepare(
+      const NameTensorMap& ins, const NameTensorMap& outs,
+      const paddle::framework::OperatorWithKernel& op,
+      const paddle::platform::Place& place,
+      const paddle::framework::AttributeMap& attrs,
+      const paddle::framework::AttributeMap& default_attrs);
+
+  void Run(const NameTensorMap& in, const NameTensorMap& out,
+           const paddle::framework::AttributeMap& attrs,
+           const paddle::framework::AttributeMap& default_attrs);
+
+  const paddle::framework::OpKernelType& kernel_type() const {
+    return kernel_type_;
+  }
+
+ private:
+  const paddle::framework::OperatorBase& op_;
+  const paddle::framework::RuntimeContext& ctx_;
+  paddle::framework::OpKernelType kernel_type_;
+  paddle::framework::OperatorWithKernel::OpKernelFunc func_;
+  paddle::platform::DeviceContext* dev_ctx_;
+};
+
+}  // namespace legacy
+}  // namespace egr
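PreparedOp above splits kernel resolution (Prepare) from kernel invocation (Run), so repeated eager calls pay the lookup and device-context resolution cost once. A minimal stand-alone sketch of that prepare/run split (ToyPreparedOp and its registry are invented here for illustration and are not the Paddle types):

#include <functional>
#include <map>
#include <stdexcept>
#include <string>
#include <utility>

using KernelFn = std::function<void(int /*numel*/)>;

class ToyPreparedOp {
 public:
  // Expensive, one-time work: find the kernel for this op type.
  static ToyPreparedOp Prepare(
      const std::string& op_type,
      const std::map<std::string, KernelFn>& registry) {
    auto it = registry.find(op_type);
    if (it == registry.end()) throw std::runtime_error("kernel not found");
    return ToyPreparedOp(it->second);
  }

  // Cheap, repeatable work: just invoke the cached kernel.
  void Run(int numel) { func_(numel); }

 private:
  explicit ToyPreparedOp(KernelFn func) : func_(std::move(func)) {}
  KernelFn func_;
};

int main() {
  std::map<std::string, KernelFn> registry{
      {"scale", [](int n) { /* launch a scale kernel over n elements */ }}};
  auto op = ToyPreparedOp::Prepare("scale", registry);
  op.Run(128);  // later calls reuse the cached kernel, skipping the lookup
}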
diff --git a/paddle/fluid/eager/legacy/tensor_helper.h b/paddle/fluid/eager/legacy/tensor_helper.h
index e69de29bb2..ce407f8965 100644
--- a/paddle/fluid/eager/legacy/tensor_helper.h
+++ b/paddle/fluid/eager/legacy/tensor_helper.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/fluid/eager/eager_tensor.h"
+#include "paddle/pten/api/all.h"
+namespace egr {
+namespace legacy {
+
+void InitializeVariable(paddle::framework::Variable* var,
+                        paddle::framework::proto::VarType::Type var_type);
+paddle::framework::proto::VarType::Type GetDtypeFromVar(
+    const paddle::framework::Variable& var);
+const paddle::platform::Place& GetPlaceFromVar(
+    const paddle::framework::Variable& var);
+void CopyVariable(const paddle::framework::Variable& src_var,
+                  paddle::framework::Variable* dst_var);
+
+}  // namespace legacy
+}  // namespace egr
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index 4e5be2e535..858688dffd 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -23,6 +23,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 
+#include "paddle/fluid/framework/pten_utils.h"
+
 namespace paddle {
 namespace framework {
 
@@ -73,9 +75,12 @@ class TestKernel : public OpKernel<float> {
     output->Resize(input->dims());
     output->mutable_data<T>(ctx.GetPlace());
 
-    operators::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
-        input, input, output, ctx.template device_context<DeviceContext>(),
-        AddFunctor<T>());
+    auto pt_input = paddle::experimental::MakePtenDenseTensor(*input);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*output);
+
+    pten::funcs::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
+        *pt_input, *pt_input, pt_out.get(),
+        ctx.template device_context<DeviceContext>(), AddFunctor<T>());
     functor.Run();
   }
 };
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 8e69f96dfb..9d75c66beb 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -41,7 +41,8 @@ limitations under the License.
*/ #include "paddle/utils/flat_hash_map.h" #include "paddle/pten/core/arg_map_context.h" -#include "paddle/pten/include/core.h" +#include "paddle/pten/core/kernel_context.h" +#include "paddle/pten/core/kernel_factory.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 16580627ed..d27460aeec 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -25,6 +25,7 @@ #include #include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/var_type.h" @@ -36,7 +37,6 @@ #include "paddle/fluid/imperative/variable_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" - namespace paddle { namespace framework { class Variable; @@ -211,6 +211,8 @@ class VarBase { framework::proto::VarType::Type DataType() const { return var_->DataType(); } + size_t ElementSize() const { return framework::SizeOfType(var_->DataType()); } + void SetForwardDataType(framework::proto::VarType::Type data_type) { var_->SetForwardDataType(data_type); } @@ -221,7 +223,10 @@ class VarBase { const platform::Place Place() const { return var_->Place(); } - void ClearGradient(); + void ClearGradient(bool set_to_zero = true); + + void _GradientSetEmpty(bool is_empty = true); + bool _IsGradientSetEmpty(); std::shared_ptr NewVarBase(const platform::Place& dst_place, const bool blocking) const; @@ -230,6 +235,8 @@ class VarBase { void BumpInplaceVersion(); + void _CopyGradientFrom(const imperative::VarBase& src); + /* Hook related method: now only used for GradVarBase */ bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); } @@ -275,16 +282,6 @@ class VarBase { static ThreadSafeNameSet name_set_; }; -class Layer { - public: - virtual ~Layer() {} - - virtual std::vector> Forward( - const std::vector>& inputs) { - return {}; - } -}; - std::shared_ptr CreateGradOpNode( const framework::OperatorBase& op, const NameVarBaseMap& ins, const NameVarBaseMap& outs, const framework::AttributeMap& attrs, diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index acb125a829..cb76a82353 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -183,6 +183,21 @@ class OpBase { const framework::AttributeMap& default_attrs, const platform::Place& place); + static pten::KernelContext* GetKernelContext() { return &pt_kernel_context_; } + + bool HasVoidFunctionPostHook() const { + return !void_function_post_hooks_.empty(); + } + + void AddVoidFunctionPostHook(std::shared_ptr>&& hook) { + void_function_post_hooks_.emplace_back(std::move(hook)); + } + + const std::vector>>& + GetVoidFunctionPostHooks() const { + return void_function_post_hooks_; + } + private: static const std::string& UnknownOpType() { static std::string kUnknownOpType{"unknown"}; @@ -197,6 +212,10 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; + // In order to reduce the compatibility phase + // performance overhead, temporarily cache KernelContext + static pten::KernelContext pt_kernel_context_; + std::vector>> void_function_post_hooks_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 53f876c498..29747e79ef 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -21,6 +21,8 @@ 
#include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" @@ -147,19 +149,29 @@ class PreparedOp { const framework::OperatorWithKernel::OpKernelFunc& func, platform::DeviceContext* dev_ctx); + PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + const framework::OpKernelType& kernel_type, + const framework::KernelSignature& kernel_signature, + const pten::Kernel& pt_kernel, + pten::KernelContext* pt_kernel_context, + platform::DeviceContext* dev_ctx); + static PreparedOp Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs); + const framework::AttributeMap& default_attrs, + pten::KernelContext* pt_kernel_context = nullptr); static PreparedOp Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs); + const framework::AttributeMap& default_attrs, + pten::KernelContext* pt_kernel_context = nullptr); void Run(const NameVarMap& in, const NameVarMap& out, const framework::AttributeMap& attrs, @@ -178,6 +190,15 @@ class PreparedOp { framework::OpKernelType kernel_type_; framework::OperatorWithKernel::OpKernelFunc func_; platform::DeviceContext* dev_ctx_; + // NOTE(chenweihang): Similar op members are used to adapt to + // new pten kernel, if there is a better design in the future, + // we may polish the implementation here + bool run_pten_kernel_{false}; + framework::KernelSignature pt_kernel_signature_; + pten::Kernel pt_kernel_; + // In order to reduce the compatibility phase + // performance overhead, temporarily cache KernelContext + pten::KernelContext* pt_kernel_context_; }; } // namespace imperative diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 72aa9a195e..c54c811b25 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/transform.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/cast_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 71115c2eba..6df982abb8 100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -19,7 +19,6 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/complex_kernel.h" namespace paddle { @@ -39,7 +38,7 @@ class ConjKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); + pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); } }; diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 8817e2f3ca..ceb8a28e8a 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -21,7 +21,6 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/dot_grad_kernel.h" #include "paddle/pten/kernels/dot_kernel.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 35807d7c57..622a6d7edb 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 385c7549e0..687340b668 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 37d29ed91b..626046890f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/transform.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/kernels/cpu/elementwise.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 36ff1ae254..9cc741344e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -22,7 +22,6 @@ limitations under the License. 
*/ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/gpu/elementwise.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 09818380d8..f035e46d1d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h index 287bbbfa3b..19f6e7a4ef 100644 --- a/paddle/fluid/operators/fill_any_like_op.h +++ b/paddle/fluid/operators/fill_any_like_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/pten_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/full_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index ef42619bfe..8e54ecb922 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/flatten_grad_kernel.h" #include "paddle/pten/kernels/flatten_kernel.h" diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index e93bd21286..9ab77cdcae 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -27,7 +27,6 @@ limitations under the License. */ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/matmul_grad_kernel.h" #include "paddle/pten/kernels/matmul_kernel.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index e1854d8a13..eb4d4a5c16 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -26,8 +26,6 @@ limitations under the License. */ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" #include "paddle/pten/kernels/cpu/reduce.h" #if defined(__HIPCC__) || defined(__NVCC__) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a25e53aac5..47b8da70ad 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/reshape_grad_kernel.h" #include "paddle/pten/kernels/reshape_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index e7a07810c6..a6f4f6e272 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -14,9 +14,12 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/framework/pten_utils.h" + +// only can include the headers in paddle/top/api dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/kernels/scale_kernel.h" namespace paddle { namespace operators { @@ -33,6 +36,7 @@ static inline T GetAttrFromTensor(const framework::Tensor* tensor) { return tensor_data[0]; } +// See Note [ Why still keep the original kernel implementation? ] template class ScaleKernel : public framework::OpKernel { public: @@ -40,13 +44,13 @@ class ScaleKernel : public framework::OpKernel { auto* in_var = ctx.InputVar("X"); auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); - auto bias = static_cast(ctx.Attr("bias")); + auto bias = ctx.Attr("bias"); auto bias_after_scale = ctx.Attr("bias_after_scale"); - auto scale = static_cast(ctx.Attr("scale")); + auto scale = ctx.Attr("scale"); if (ctx.HasInput("ScaleTensor")) { auto* scale_tensor = ctx.Input("ScaleTensor"); - scale = GetAttrFromTensor(scale_tensor); + scale = static_cast(GetAttrFromTensor(scale_tensor)); } auto* out_var = ctx.OutputVar("Out"); @@ -56,22 +60,17 @@ class ScaleKernel : public framework::OpKernel { out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } - auto* out = framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); out->mutable_data(in->place()); + auto& dev_ctx = ctx.device_context(); - PADDLE_ENFORCE_EQ(in->dims(), out->dims(), - paddle::platform::errors::InvalidArgument( - "the input and output should have the same dim" - "but input dim is %s, output dim is %s", - in->dims(), out->dims())); + auto pt_x = paddle::experimental::MakePtenDenseTensor(*in); + auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = *ctx.template device_context().eigen_device(); - EigenScale, T>::Eval( - dev, eigen_out, eigen_in, scale, bias, bias_after_scale); + // call new kernel + pten::ScaleKernel(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale, + pt_out.get()); } }; diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h index b8dd44c01b..8294cd2c5f 100644 --- a/paddle/fluid/operators/sign_op.h +++ b/paddle/fluid/operators/sign_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/sign_kernel.h" namespace paddle { diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 9484d506b2..102bc9f162 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -26,7 +26,6 @@ limitations under the License. 
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index b8dd44c01b..8294cd2c5f 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
-#include "paddle/pten/include/core.h"
 #include "paddle/pten/kernels/sign_kernel.h"
 
 namespace paddle {
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index 9484d506b2..102bc9f162 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/core.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index 659df6b9b4..aaf86bc41a 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -34,7 +34,6 @@ limitations under the License. */
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/core.h"
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index a0067f9c64..a8c1da2a8b 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -31,7 +31,6 @@ limitations under the License. */
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/core.h"
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc
index 71b8bbbb1a..038a1254d7 100644
--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -28,7 +28,6 @@ limitations under the License. */
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/core.h"
 #pragma GCC diagnostic ignored "-Wwrite-strings"
 
 namespace paddle {
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 9849d0d416..c1049d2407 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/core.h"
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 6a823ff367..a9b7c7581b 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -29,4 +29,4 @@ get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
 message(STATUS "All standard pten kernels: ${pten_kernels}")
 set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels})
 
-cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
+cc_library(pten DEPS ${PTEN_DEPS})
diff --git a/paddle/pten/all.cc b/paddle/pten/all.cc
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/paddle/pten/all.h b/paddle/pten/all.h
deleted file mode 100644
index c8be629b10..0000000000
--- a/paddle/pten/all.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// developer apis
-#include "paddle/pten/include/core.h"
-#include "paddle/pten/include/infermeta.h"
-#include "paddle/pten/include/math.h"
diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc
index e69de29bb2..6eb1e5a379 100644
--- a/paddle/pten/api/lib/utils.cc
+++ b/paddle/pten/api/lib/utils.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/include/utils.h"
+
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/pten/api/lib/api_registry.h"
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/infermeta/unary.h"
+
+PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
+#endif
+
+#ifdef PADDLE_WITH_XPU
+PT_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT);
+#endif
+
+namespace paddle {
+namespace experimental {
+
+PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) {
+  // 1. Get kernel signature and kernel
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "copy", kernel_key);
+
+  VLOG(0) << "to API kernel key: " << kernel_key;
+  VLOG(0) << "to API kernel: " << kernel;
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  kernel_context.EmplaceBackAttr(blocking);
+
+  // 4. InferMeta
+  auto out_meta = UnchangedInferMeta(dense_x->meta());
+
+  // 5. Prepare outputs
+  auto dense_out = std::make_shared<pten::DenseTensor>(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          pten::TransToFluidPlace(backend)),
+      std::move(out_meta));
+  kernel_context.EmplaceBackOutput(dense_out);
+  Tensor out;
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
+}  // namespace experimental
+}  // namespace paddle
+
+PT_REGISTER_API(Utils);
diff --git a/paddle/pten/include/core.h b/paddle/pten/include/core.h
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/paddle/pten/include/infermeta.h b/paddle/pten/include/infermeta.h
deleted file mode 100644
index 5e356dd37c..0000000000
--- a/paddle/pten/include/infermeta.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// See Note: [ How do we organize the kernel directory ]
-#include "paddle/pten/infermeta/binary.h"
-#include "paddle/pten/infermeta/multiary.h"
-#include "paddle/pten/infermeta/nullary.h"
-#include "paddle/pten/infermeta/unary.h"
diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h
deleted file mode 100644
index a4fb7f4d98..0000000000
--- a/paddle/pten/include/math.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// See Note: [ How do we organize the kernel directory ]
-#include "paddle/pten/api/lib/utils/storage.h"
-#include "paddle/pten/include/infermeta.h"
-#include "paddle/pten/kernels/scale_kernel.h"
-
-namespace pten {
-
-template <typename T, typename ContextT>
-DenseTensor Scale(const ContextT& dev_ctx,
-                  const DenseTensor& x,
-                  const Scalar& scale,
-                  float bias,
-                  bool bias_after_scale) {
-  auto out_meta = UnchangedInferMeta(x.meta());
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Scale<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
-  return dense_out;
-}
-
-}  // namespace pten
diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h
index 9dd3d457e4..b6074f117e 100644
--- a/paddle/pten/kernels/complex_kernel.h
+++ b/paddle/pten/kernels/complex_kernel.h
@@ -15,9 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/infermeta.h"
-#include "paddle/pten/kernels/empty_kernel.h"
-
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/empty_kernel.h"
diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc
index e69de29bb2..0582fb87b4 100644
--- a/paddle/pten/kernels/cpu/scale_kernel.cc
+++ b/paddle/pten/kernels/cpu/scale_kernel.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/kernels/scale_kernel.h"
+
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/funcs/eigen/common.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/fluid/platform/bfloat16.h"
+namespace pten {
+
+template <typename T, typename Context>
+void ScaleKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const Scalar& scale,
+                 float bias,
+                 bool bias_after_scale,
+                 DenseTensor* out) {
+  // calc
+  out->mutable_data<T>();
+  auto eigen_out = pten::EigenVector<T>::Flatten(*out);
+  auto eigen_x = pten::EigenVector<T>::Flatten(x);
+  auto& dev = *dev_ctx.eigen_device();
+  // TODO(chenweihang): the Eigen function used here requires the dtypes of
+  // scale, eigen_x and bias to match, so the two scalar args are cast to T
+  // here; maybe we should declare that the types of scale and bias are T?
+  paddle::operators::EigenScale<std::decay_t<decltype(dev)>, T>::Eval(
+      dev,
+      eigen_out,
+      eigen_x,
+      scale.to<T>(),
+      static_cast<T>(bias),
+      bias_after_scale);
+}
+
+}  // namespace pten
+
+PT_REGISTER_CTX_KERNEL(scale,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::ScaleKernel,
+                       float,
+                       double,
+                       paddle::platform::bfloat16,
+                       uint8_t,
+                       int8_t,
+                       int16_t,
+                       int,
+                       int64_t) {}
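For reference on semantics: with bias_after_scale = true the kernel computes out = scale * x + bias, which is exactly what the device API test at the end of this patch checks (x = 11, scale = 2, bias = 1 gives 2 * 11 + 1 = 23); the attribute name indicates that with bias_after_scale = false the bias is applied before scaling instead, i.e. out = scale * (x + bias).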
diff --git a/paddle/pten/kernels/flatten_kernel.h b/paddle/pten/kernels/flatten_kernel.h
index a67e66fac4..c974fda1ed 100644
--- a/paddle/pten/kernels/flatten_kernel.h
+++ b/paddle/pten/kernels/flatten_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/empty_kernel.h"
 
 namespace pten {
diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu
index 68574c063e..ff7e2a6ed2 100644
--- a/paddle/pten/kernels/gpu/scale_kernel.cu
+++ b/paddle/pten/kernels/gpu/scale_kernel.cu
@@ -44,12 +44,12 @@ struct ScaleFunctor {
 };
 
 template <typename T, typename ContextT>
-void Scale(const ContextT& dev_ctx,
-           const DenseTensor& x,
-           const Scalar& scale,
-           float bias,
-           bool bias_after_scale,
-           DenseTensor* out) {
+void ScaleKernel(const ContextT& dev_ctx,
+                 const DenseTensor& x,
+                 const Scalar& scale,
+                 float bias,
+                 bool bias_after_scale,
+                 DenseTensor* out) {
   std::vector<const DenseTensor*> inputs;
   std::vector<DenseTensor*> outputs;
   inputs.emplace_back(&x);
@@ -67,7 +67,7 @@ void Scale(const ContextT& dev_ctx,
 PT_REGISTER_CTX_KERNEL(scale,
                        GPU,
                        ALL_LAYOUT,
-                       pten::Scale,
+                       pten::ScaleKernel,
                        float,
                        double,
                        paddle::platform::float16,
diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
index 802cc019d7..b1bae78ddc 100644
--- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
+++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-// #include "paddle/pten/kernels/complex_kernel.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/complex_kernel.h"
 #include "paddle/pten/kernels/empty_kernel.h"
 #include "paddle/pten/kernels/impl/dot_grad_kernel_impl.h"
 #include "paddle/pten/kernels/impl/matmul_kernel_impl.h"
diff --git a/paddle/pten/kernels/impl/scale_kernel_impl.h b/paddle/pten/kernels/impl/scale_kernel_impl.h
deleted file mode 100644
index 2e0b158b36..0000000000
--- a/paddle/pten/kernels/impl/scale_kernel_impl.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/pten/common/scalar.h"
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/funcs/eigen/common.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-
-namespace pten {
-
-template <typename T, typename Context>
-void Scale(const Context& dev_ctx,
-           const DenseTensor& x,
-           const Scalar& scale,
-           float bias,
-           bool bias_after_scale,
-           DenseTensor* out) {
-  // calc
-  out->mutable_data<T>();
-  auto eigen_out = pten::EigenVector<T>::Flatten(*out);
-  auto eigen_x = pten::EigenVector<T>::Flatten(x);
-  auto& dev = *dev_ctx.eigen_device();
-  // TODO(chenweihang): now the eigen function here need the dtype of scale,
-  // eigen_x, bias should be same, so here need cast for two scalar arg,
-  // maybe we declare that the type of scale and bias is T?
-  paddle::operators::EigenScale<std::decay_t<decltype(dev)>, T>::Eval(
-      dev,
-      eigen_out,
-      eigen_x,
-      scale.to<T>(),
-      static_cast<T>(bias),
-      bias_after_scale);
-}
-
-}  // namespace pten
diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h
index f87d0a31b4..e01103fc5b 100644
--- a/paddle/pten/kernels/math_kernel.h
+++ b/paddle/pten/kernels/math_kernel.h
@@ -16,7 +16,8 @@ limitations under the License. */
 
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/infermeta/binary.h"
+#include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/empty_kernel.h"
 
 namespace pten {
diff --git a/paddle/pten/kernels/reshape_kernel.h b/paddle/pten/kernels/reshape_kernel.h
index faa51c69ad..293f6cd2ba 100644
--- a/paddle/pten/kernels/reshape_kernel.h
+++ b/paddle/pten/kernels/reshape_kernel.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/pten/common/scalar_array.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/empty_kernel.h"
 
 namespace pten {
diff --git a/paddle/pten/kernels/scale_kernel.h b/paddle/pten/kernels/scale_kernel.h
index e69de29bb2..ba16db566b 100644
--- a/paddle/pten/kernels/scale_kernel.h
+++ b/paddle/pten/kernels/scale_kernel.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/infermeta/unary.h"
+#include "paddle/pten/kernels/empty_kernel.h"
+namespace pten {
+
+template <typename T, typename Context>
+void ScaleKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const Scalar& scale,
+                 float bias,
+                 bool bias_after_scale,
+                 DenseTensor* out);
+
+template <typename T, typename ContextT>
+DenseTensor Scale(const ContextT& dev_ctx,
+                  const DenseTensor& x,
+                  const Scalar& scale,
+                  float bias,
+                  bool bias_after_scale) {
+  auto out_meta = UnchangedInferMeta(x.meta());
+  auto dense_out = pten::Empty<T, ContextT>(dev_ctx, std::move(out_meta));
+  ScaleKernel<T, ContextT>(
+      dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
+  return dense_out;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/sign_kernel.h b/paddle/pten/kernels/sign_kernel.h
index ba205fc96a..304b640d2a 100644
--- a/paddle/pten/kernels/sign_kernel.h
+++ b/paddle/pten/kernels/sign_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/empty_kernel.h"
 
 namespace pten {
diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h
index e69de29bb2..41143826c4 100644
--- a/paddle/pten/tests/api/scale_api.h
+++ b/paddle/pten/tests/api/scale_api.h
@@ -0,0 +1,279 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
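+
+// This test header hand-writes the dispatch that the generated C++ API will
+// later perform automatically: scale_kernel_context() feeds the selected
+// kernel through a pten::KernelContext, while scale_switch_case() below
+// dispatches on the runtime DataType with an explicit switch. Both paths end
+// in the same pten::ScaleKernel<T> instantiations registered by the kernel
+// files above.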
+
+#pragma once
+
+#include "glog/logging.h"
+
+#include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/api/lib/api_registry.h"
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/common/scalar_array.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/infermeta/unary.h"
+#include "paddle/pten/kernels/scale_kernel.h"
+
+namespace paddle {
+namespace experimental {
+
+PADDLE_API Tensor scale_kernel_context(const Tensor& x,
+                                       const Scalar& scale,
+                                       float bias,
+                                       bool bias_after_scale) {
+  Backend kernel_backend = Backend::UNDEFINED;
+  DataLayout kernel_layout = DataLayout::UNDEFINED;
+  DataType kernel_data_type = DataType::UNDEFINED;
+
+  if (kernel_backend == Backend::UNDEFINED ||
+      kernel_layout == DataLayout::UNDEFINED ||
+      kernel_data_type == DataType::UNDEFINED) {
+    auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+    auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+    if (kernel_backend == Backend::UNDEFINED) {
+      kernel_backend = kernel_key.backend();
+    }
+    if (kernel_layout == DataLayout::UNDEFINED) {
+      kernel_layout = kernel_key.layout();
+    }
+    if (kernel_data_type == DataType::UNDEFINED) {
+      kernel_data_type = kernel_key.dtype();
+    }
+  }
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "scale", {kernel_backend, kernel_layout, kernel_data_type});
+  VLOG(6) << "scale API kernel key: [" << kernel_backend << ", "
+          << kernel_layout << ", " << kernel_data_type << "]";
+  VLOG(6) << "scale API kernel: " << kernel;
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+
+  kernel_context.EmplaceBackAttr(pten::Scalar(scale));
+  kernel_context.EmplaceBackAttr(bias);
+  kernel_context.EmplaceBackAttr(bias_after_scale);
+
+  auto out_meta = pten::UnchangedInferMeta(dense_x->meta());
+  auto dense_out = std::make_shared<pten::DenseTensor>(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          pten::TransToFluidPlace(kernel_backend)),
+      std::move(out_meta));
+  kernel_context.EmplaceBackOutput(dense_out);
+
+  Tensor out;
+  out.set_impl(dense_out);
+
+  kernel(&kernel_context);
+  return out;
+}
+
+static void ScaleCPU(DataType kernel_dtype,
+                     const pten::CPUContext& dev_ctx,
+                     const pten::DenseTensor& x,
+                     const Scalar& scale,
+                     float bias,
+                     bool bias_after_scale,
+                     pten::DenseTensor* dense_out) {
+  switch (kernel_dtype) {
+    case pten::DataType::FLOAT64: {
+      pten::ScaleKernel<double>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::FLOAT32: {
+      pten::ScaleKernel<float>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::BFLOAT16: {
+      pten::ScaleKernel<paddle::platform::bfloat16>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::INT64: {
+      pten::ScaleKernel<int64_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::INT32: {
+      pten::ScaleKernel<int32_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::INT16: {
+      pten::ScaleKernel<int16_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::INT8: {
+      pten::ScaleKernel<int8_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::UINT8: {
+      pten::ScaleKernel<uint8_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    default: {
+      PADDLE_THROW(paddle::platform::errors::Fatal(
+          "Detected unsupported data type. "
+          "Only Float64, Float32, BFloat16, Int64, Int32, Int16, Int8, UInt8 "
+          "are supported for now."));
+      break;
+    }
+  }
+}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+static void ScaleGPU(DataType kernel_dtype,
+                     const pten::GPUContext& dev_ctx,
+                     const pten::DenseTensor& x,
+                     const Scalar& scale,
+                     float bias,
+                     bool bias_after_scale,
+                     pten::DenseTensor* dense_out) {
+  switch (kernel_dtype) {
+    case pten::DataType::FLOAT64: {
+      pten::ScaleKernel<double>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::FLOAT32: {
+      pten::ScaleKernel<float>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::FLOAT16: {
+      pten::ScaleKernel<paddle::platform::float16>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::INT64: {
+      pten::ScaleKernel<int64_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::INT32: {
+      pten::ScaleKernel<int32_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::INT16: {
+      pten::ScaleKernel<int16_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::INT8: {
+      pten::ScaleKernel<int8_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    case pten::DataType::UINT8: {
+      pten::ScaleKernel<uint8_t>(
+          dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
+      break;
+    }
+    default: {
+      PADDLE_THROW(paddle::platform::errors::Fatal(
+          "Detected unsupported data type. "
+          "Only Float64, Float32, Float16, Int64, Int32, Int16, Int8, UInt8 "
+          "are supported for now."));
+      break;
+    }
+  }
+}
+#endif
+
+Tensor scale_switch_case(const Tensor& x,
+                         const Scalar& scale,
+                         float bias,
+                         bool bias_after_scale) {
+  Backend kernel_backend = Backend::UNDEFINED;
+  DataLayout kernel_layout = DataLayout::UNDEFINED;
+  DataType kernel_data_type = DataType::UNDEFINED;
+
+  if (kernel_backend == Backend::UNDEFINED ||
+      kernel_layout == DataLayout::UNDEFINED ||
+      kernel_data_type == DataType::UNDEFINED) {
+    auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+    auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+    if (kernel_backend == Backend::UNDEFINED) {
+      kernel_backend = kernel_key.backend();
+    }
+    if (kernel_layout == DataLayout::UNDEFINED) {
+      kernel_layout = kernel_key.layout();
+    }
+    if (kernel_data_type == DataType::UNDEFINED) {
+      kernel_data_type = kernel_key.dtype();
+    }
+  }
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "scale", {kernel_backend, kernel_layout, kernel_data_type});
+  VLOG(6) << "scale API kernel key: [" << kernel_backend << ", "
+          << kernel_layout << ", " << kernel_data_type << "]";
+  VLOG(6) << "scale API kernel: " << kernel;
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+
+  auto out_meta = pten::UnchangedInferMeta(dense_x->meta());
+  auto dense_out = std::make_shared<pten::DenseTensor>(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          pten::TransToFluidPlace(kernel_backend)),
+      std::move(out_meta));
+
+  Tensor out;
+  out.set_impl(dense_out);
+
+  switch (kernel_backend) {
+    case Backend::CPU:
+      ScaleCPU(kernel_data_type,
+               static_cast<const pten::CPUContext&>(*dev_ctx),
+               *dense_x,
+               scale,
+               bias,
+               bias_after_scale,
+               dense_out.get());
+      break;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    case Backend::GPU:
+      ScaleGPU(kernel_data_type,
+               static_cast<const pten::GPUContext&>(*dev_ctx),
+               *dense_x,
+               scale,
+               bias,
+               bias_after_scale,
+               dense_out.get());
+      break;
+#endif
+    default:
+      PADDLE_THROW(paddle::platform::errors::Fatal(
+          "Detected unsupported backend. "
+          "Only CPU and CUDA Backend are supported for now. "
+          "Please double check if your backend falls into the above two "
+          "categories."));
+  }
+
+  return out;
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc
index e69de29bb2..fe26f56552 100644
--- a/paddle/pten/tests/kernels/test_scale_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/kernels/scale_kernel.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+namespace pten {
+namespace tests {
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(DEV_API, scale) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 4}),
+                                                  pten::DataLayout::NCHW));
+
+  auto* dense_x_data = dense_x.mutable_data<float>();
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+  }
+  float scale = 2;
+  float bias = 1;
+  bool bias_after_scale = true;
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Scale<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      scale,
+      bias,
+      bias_after_scale);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.numel(), 12);
+  ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = 23;
+  auto actual_result = out.data<float>()[11];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}
+
+TEST(DEV_API, scale_host) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 4}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+  }
+  const auto alloc2 = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor scale(alloc2,
+                          pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                framework::make_ddim({1}),
+                                                pten::DataLayout::NCHW));
+  scale.mutable_data<float>()[0] = 2;
+  float bias = 1;
+  bool bias_after_scale = true;
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Scale<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      scale,
+      bias,
+      bias_after_scale);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.numel(), 12);
+  ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = 23;
+  auto actual_result = out.data<float>()[11];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index e69de29bb2..e8539b11d1 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -0,0 +1,435 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
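+
+# The yaml entries consumed below follow the schema read in API.__init__;
+# an illustrative entry (hypothetical, not copied from api.yaml) looks like:
+#
+#   - api : scale
+#     args : (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale)
+#     output : Tensor
+#     infer_meta :
+#       func : UnchangedInferMeta
+#       param : [x]
+#     kernel :
+#       func : scale
+#
+# An entry may instead carry an 'invoke' key that forwards to another API,
+# which bypasses kernel selection entirely (see is_base_api below).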
+
+import os
+import yaml
+import argparse
+
+
+class API:
+    prefix_tensor_name = 'dense_'
+
+    def __init__(self, api_item_yaml):
+        self.api = api_item_yaml['api']
+        # args:
+        #   inputs:
+        #     names : [], list of input names
+        #   attrs:
+        #     names : [], list of attribute names
+        #     attr_info : { attr_name : (type, default_values)}
+        self.args = self.parse_args(api_item_yaml['args'])
+        self.output = api_item_yaml['output']
+        self.is_base_api = True
+        if 'invoke' in api_item_yaml:
+            self.is_base_api = False
+            self.invoke = api_item_yaml['invoke']
+        else:
+            self.kernel = api_item_yaml['kernel']
+            if 'backend' not in self.kernel or len(self.kernel['backend']) == 0:
+                self.kernel['backend'] = None
+            if 'layout' not in self.kernel or len(self.kernel['layout']) == 0:
+                self.kernel['layout'] = None
+            if 'data_type' not in self.kernel or len(self.kernel[
+                    'data_type']) == 0:
+                self.kernel['data_type'] = None
+            if 'param' not in self.kernel:
+                self.kernel['param'] = None
+
+            self.infer_meta = api_item_yaml['infer_meta']
+            if 'param' not in self.infer_meta:
+                self.infer_meta['param'] = None
+
+    def parse_args(self, args_str):
+        inputs = {'names': []}
+        attrs = {'names': [], 'attr_info': {}}
+        args_str = args_str.strip()
+        assert args_str.startswith('(') and args_str.endswith(')'), \
+            f"Args declaration should start with '(' and end with ')', please check the args of {self.api} in api.yaml."
+        args_str = args_str[1:-1]
+        args_list = args_str.split(',')
+        input_types = ['const Tensor&', 'const Tensor &']
+        attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \
+                      'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \
+                      'const std::vector<int64_t>&', 'Backend', 'DataLayout', 'DataType']
+        args_declare_str = ""
+        args_define_str = ""
+        for item in args_list:
+            item = item.strip()
+            # match the input tensor
+            has_input = False
+            for in_type in input_types:
+                if item.startswith(in_type):
+                    input_name = item[len(in_type):].strip()
+                    assert len(input_name) > 0, \
+                        f"The input tensor name should not be empty. Please check the args of {self.api} in api.yaml."
+                    inputs['names'].append(input_name)
+                    args_declare_str = args_declare_str + in_type + ' ' + input_name + ', '
+                    args_define_str = args_define_str + in_type + ' ' + input_name + ', '
+                    has_input = True
+                    break
+            if has_input:
+                continue
+
+            # match the attribute
+            for attr_type in attr_types:
+                if item.startswith(attr_type):
+                    attr_name = item[len(attr_type):].strip()
+                    assert len(attr_name) > 0, \
+                        f"The attribute name should not be empty. Please check the args of {self.api} in api.yaml."
+                    default_value = None
+                    if '=' in attr_name:
+                        attr_infos = attr_name.split('=')
+                        attr_name = attr_infos[0].strip()
+                        default_value = attr_infos[1].strip()
+
+                    default_value_str = "" if default_value is None else '=' + default_value
+                    args_declare_str = args_declare_str + attr_type + ' ' + attr_name + default_value_str + ', '
+                    args_define_str = args_define_str + attr_type + ' ' + attr_name + ', '
+                    attrs['names'].append(attr_name)
+                    attrs['attr_info'][attr_name] = (attr_type, default_value)
+                    break
+
+        args = {
+            'inputs': inputs,
+            'attrs': attrs,
+            'args_declare': args_declare_str[:-2],
+            'args_define': args_define_str[:-2]
+        }
+        return args
+
+    def gene_api_declaration(self):
+        return f"""
+PADDLE_API {self.output} {self.api}({self.args['args_declare']});
+"""
+
+    def gene_kernel_select(self, input_names, attrs, kernel):
+
+        kernel_key_item_init = """
+  Backend kernel_backend = Backend::UNDEFINED;
+  DataLayout kernel_layout = DataLayout::UNDEFINED;
+  DataType kernel_data_type = DataType::UNDEFINED;
+"""
+        # Check the tensor options
+        attr_backend_count = 0
+        attr_layout_count = 0
+        attr_data_type_count = 0
+        for attr_name in attrs['names']:
+            if attrs['attr_info'][attr_name][0] == 'Backend':
+                assert kernel['backend'] is not None, \
+                    f"{self.api} api: When there is a parameter with 'Backend' type in attributes, you must set the backend of the kernel manually."
+                attr_backend_count = attr_backend_count + 1
+            if attrs['attr_info'][attr_name][0] == 'DataLayout':
+                assert kernel['layout'] is not None, \
+                    f"{self.api} api: When there is a parameter with 'DataLayout' type in attributes, you must set the layout of the kernel manually."
+                attr_layout_count = attr_layout_count + 1
+            if attrs['attr_info'][attr_name][0] == 'DataType':
+                assert kernel['data_type'] is not None, \
+                    f"{self.api} api: When there is a parameter with 'DataType' type in attributes, you must set the data_type of the kernel manually."
+                attr_data_type_count = attr_data_type_count + 1
+
+        # preprocess kernel configures
+        kernel_select_code = ""
+        if kernel['backend'] is not None:
+            if '>' in kernel['backend']:
+                vars_list = kernel['backend'].split('>')
+                assert len(
+                    vars_list
+                ) == 2, f"{self.api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
+                assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \
+                    f"{self.api} api: When using '>' to set the kernel backend, the first param should be an attribute with Backend type."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
+"""
+
+            else:
+                args_str = ""
+                for ele in kernel['backend'].split(','):
+                    args_str = args_str + ele.strip() + ', '
+                kernel_select_code = kernel_select_code + f"""
+  kernel_backend = ParseBackend({args_str[:-2]});
+"""
+
+        if kernel['layout'] is not None:
+            if '>' in kernel['layout']:
+                vars_list = kernel['layout'].split('>')
+                assert len(
+                    vars_list
+                ) == 2, f"{self.api} api: The number of params to set layout with '>' only allows 2, but received {len(vars_list)}."
+                assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataLayout', \
+                    f"{self.api} api: When using '>' to set the kernel layout, the first param should be an attribute with DataLayout type."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_layout = ParseLayoutWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
+"""
+
+            else:
+                vars_list = kernel['layout'].split(',')
+                assert len(
+                    vars_list
+                ) == 1, f"{self.api} api: The number of params to set layout must be 1, but received {len(vars_list)}."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_layout = ParseLayout({vars_list[0].strip()});
+"""
+
+        if kernel['data_type'] is not None:
+            if '>' in kernel['data_type']:
+                vars_list = kernel['data_type'].split('>')
+                assert len(
+                    vars_list
+                ) == 2, f"{self.api} api: The number of params to set data_type with '>' only allows 2, but received {len(vars_list)}."
+                assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataType', \
+                    f"{self.api} api: When using '>' to set the kernel data_type, the first param should be an attribute with DataType type."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_data_type = ParseDataTypeWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
+"""
+
+            else:
+                vars_list = kernel['data_type'].split(',')
+                assert len(
+                    vars_list
+                ) == 1, f"{self.api} api: The number of params to set data_type must be 1, but received {len(vars_list)}."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_data_type = ParseDataType({vars_list[0].strip()});
+"""
+
+        if len(input_names) == 0:
+            assert attr_backend_count > 0 and attr_layout_count > 0 and attr_data_type_count > 0, \
+                f"{self.api} api: When there is no input tensor, the args must have 'Backend', 'DataLayout' and 'DataType'."
+
+        kernel_select_args = ""
+        for input_name in input_names:
+            kernel_select_args = kernel_select_args + input_name + ", "
+
+        if len(kernel_select_args) > 2:
+            kernel_select_args = kernel_select_args[:-2]
+
+        kernel_select_code = kernel_key_item_init + kernel_select_code
+
+        if len(input_names) > 0:
+            kernel_select_code = kernel_select_code + f"""
+  if (kernel_backend == Backend::UNDEFINED
+        || kernel_layout == DataLayout::UNDEFINED
+        || kernel_data_type == DataType::UNDEFINED ) {{
+    auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args});
+    auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+    if (kernel_backend == Backend::UNDEFINED) {{
+      kernel_backend = kernel_key.backend();
+    }}
+    if (kernel_layout == DataLayout::UNDEFINED) {{
+      kernel_layout = kernel_key.layout();
+    }}
+    if (kernel_data_type == DataType::UNDEFINED) {{
+      kernel_data_type = kernel_key.dtype();
+    }}
+  }}"""
+
+        kernel_select_code = kernel_select_code + f"""
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "{kernel['func']}", {{kernel_backend, kernel_layout, kernel_data_type}});
+  VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
+  VLOG(6) << "{self.api} API kernel: " << kernel;"""
+
+        return kernel_select_code
+
+    def gene_infer_meta(self, input_names, attr_names, infer_meta) -> str:
+        infer_meta_params = infer_meta['param'] if infer_meta[
+            'param'] is not None else input_names + attr_names
+        param_code = ""
+        for param in infer_meta_params:
+            if param in input_names:
+                param_code = param_code + self.prefix_tensor_name + param + "->meta(), "
+            elif param in attr_names:
+                param_code = param_code + param + ", "
+            elif isinstance(param, str):
+                param_code = param_code + "\"" + param + "\", "
+            elif isinstance(param, bool):
+                param_code = param_code + str(param).lower() + ", "
+            else:
+                param_code = param_code + str(param) + ", "
+
+        param_code = param_code[:-2]
+        return f"""
+  auto out_meta = pten::{infer_meta['func']}({param_code});
+"""
+
+    def get_kernel_args(self, input_names, attrs, kernel_param):
+        input_tensor_code = ""
+        for input_name in input_names:
+            # set input code
+            input_tensor_code = input_tensor_code + f"""
+  auto {self.prefix_tensor_name}{input_name} = std::dynamic_pointer_cast<pten::DenseTensor>({input_name}.impl());"""
+
+        attr_names = attrs['names']
+        if kernel_param is None:
+            kernel_param = input_names + attr_names
+
+        kernel_args = "*dev_ctx, "
+        for param in kernel_param:
+            if param in input_names:
+                kernel_args = kernel_args + "*" + self.prefix_tensor_name + param + ", "
+            elif param in attr_names:
+                # set attr for kernel_context
+                if 'ScalarArray' in attrs['attr_info'][param][0]:
+                    param = 'pten::ScalarArray(' + param + ')'
+                elif 'Scalar' in attrs['attr_info'][param][0]:
+                    param = 'pten::Scalar(' + param + ')'
+                kernel_args = kernel_args + param + ", "
+            elif isinstance(param, bool):
+                kernel_args = kernel_args + str(param).lower() + ", "
+            else:
+                kernel_args = kernel_args + str(param) + ", "
+        return input_tensor_code, kernel_args[:-2]
+
+    def gene_api_code(self):
+        if self.is_base_api:
+            input_tensors, kernel_args = self.get_kernel_args(
+                self.args['inputs']['names'], self.args['attrs'],
+                self.kernel['param'])
+            return f"""
+PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
+{self.gene_kernel_select(self.args['inputs']['names'], self.args['attrs'], self.kernel)}
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+{input_tensors}
+{self.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)}
+  auto dense_out = std::make_shared<pten::DenseTensor>(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          pten::TransToFluidPlace(kernel_backend)),
+      std::move(out_meta));
+
+  Tensor out;
+  out.set_impl(dense_out);
+
+  auto* kernel_fn = kernel.GetVariadicKernelFn();
+  (*kernel_fn)({kernel_args}, dense_out.get());
+
+  return out;
+}}
+"""
+
+        else:
+            return f"""
+PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
+  return {self.invoke};
+}}
+"""
+
+
+def header_include():
+    return """
+#include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/common/scalar_array.h"
+"""
+
+
+def source_include(header_file_path):
+    return f"""
+#include "{header_file_path}"
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/pten/api/include/kernel_signature.h"
+#include "paddle/pten/api/lib/api_registry.h"
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/infermeta/binary.h"
+#include "paddle/pten/infermeta/multiary.h"
+#include "paddle/pten/infermeta/nullary.h"
+#include "paddle/pten/infermeta/unary.h"
+#include "paddle/pten/kernels/declarations.h"
+"""
+
+
+def api_register():
+    return """
+PT_REGISTER_API(Creation);
+PT_REGISTER_API(Linalg);
+PT_REGISTER_API(Manipulation);
+PT_REGISTER_API(Math);
+"""
+
+
+def api_namespace():
+    return ("""
+namespace paddle {
+namespace experimental {
+
+""", """
+
+}  // namespace experimental
+}  // namespace paddle
+""")
+
+
+def generate_api(api_yaml_path, header_file_path, source_file_path):
+
+    with open(api_yaml_path, 'r') as f:
+        apis = yaml.load(f, Loader=yaml.FullLoader)
+    header_file = open(header_file_path, 'w')
+    source_file = open(source_file_path, 'w')
+
+    namespace = api_namespace()
+
+    header_file.write("#pragma once\n")
+    header_file.write(header_include())
+    header_file.write(namespace[0])
+
+    include_header_file = "paddle/pten/api/include/api.h"
+    source_file.write(source_include(include_header_file))
+    source_file.write(namespace[0])
+
+    for api in apis:
+        api_code = API(api)
+        print(api_code.gene_api_declaration())
+        header_file.write(api_code.gene_api_declaration())
+        source_file.write(api_code.gene_api_code())
+
+    header_file.write(namespace[1])
+    source_file.write(namespace[1])
+    source_file.write(api_register())
+
+    header_file.close()
+    source_file.close()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Generate PaddlePaddle C++ API files')
+    parser.add_argument(
+        '--api_yaml_path',
+        help='path to yaml file directory',
+        default='python/paddle/utils/code_gen/api.yaml')
+    parser.add_argument(
+        '--api_header_path',
+        help='output of generated api header code file',
+        default='paddle/pten/api/include/api.h')
+
+    parser.add_argument(
+        '--api_source_path',
+        help='output of generated api source code file',
+        default='paddle/pten/api/lib/api.cc')
+
+    options = parser.parse_args()
+
+    api_yaml_path = options.api_yaml_path
+    header_file_path = options.api_header_path
+    source_file_path = options.api_source_path
+
+    generate_api(api_yaml_path, header_file_path, source_file_path)
+
+
+if __name__ == '__main__':
+    main()
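For reference, the generator is driven by the three flags registered in main(); with the defaults above, the typical invocation from the repository root is:

    python python/paddle/utils/code_gen/api_gen.py \
        --api_yaml_path python/paddle/utils/code_gen/api.yaml \
        --api_header_path paddle/pten/api/include/api.h \
        --api_source_path paddle/pten/api/lib/api.cc

Each flag simply defaults to the path shown, so running the script with no arguments is equivalent.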
-- 
Gitee