diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 0817dc336716211e3a538195f81d60ef3e0c5d08..7f44afabf2590c1ebfb0b2f95fa133d6bfc6548e 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -29,6 +29,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/imperative/all_reduce.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/basic_engine.h" @@ -51,6 +52,8 @@ limitations under the License. */ namespace paddle { namespace pybind { +PyTypeObject *g_varbase_pytype = nullptr; + namespace py = ::pybind11; class Layer : public imperative::Layer { @@ -133,30 +136,44 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); + } else if (py::isinstance(place_obj)) { + return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace")); + "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace")); } } -static void InitTensorForVarBase(imperative::VarBase *self, - const py::array &array, - const platform::Place place, - bool persistable = false, - bool zero_copy = false, std::string name = "", - int stop_gradient = -1) { - if (name == "") { - name = - imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor"); - } - VLOG(5) << "Init Tensor as: / name: " << name - << " / persistable: " << persistable << " / zero_copy: " << zero_copy +// only initialize varbase, but not its tensor. +static void InitVarBaseOnly(imperative::VarBase *self, const std::string &name, + bool persistable = false, int stop_gradient = -1) { + auto name_ = name == "" + ? 
imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; + + VLOG(5) << "Init Tensor as: / name: " << name_ + << " / persistable: " << persistable << " / stop_gradient: " << stop_gradient; - new (self) imperative::VarBase(name); + new (self) imperative::VarBase(name_); + if (stop_gradient != -1) { + self->SetOverridedStopGradient(stop_gradient); + } + self->SetPersistable(persistable); + self->SetType(framework::proto::VarType::LOD_TENSOR); +} + +// initialize varbase and its tensor. +static void InitVarBaseAndTensor( + imperative::VarBase *self, const py::array &array, + const platform::Place &place, const std::string &name, + bool persistable = false, bool zero_copy = false, int stop_gradient = -1) { + InitVarBaseOnly(self, name, persistable, stop_gradient); auto *tensor = self->MutableVar()->GetMutable(); + VLOG(4) << "zero_copy: " << zero_copy; if (platform::is_cpu_place(place)) { SetTensorFromPyArray( tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy); @@ -170,30 +187,23 @@ static void InitTensorForVarBase(imperative::VarBase *self, SetTensorFromPyArray( tensor, array, BOOST_GET_CONST(platform::CUDAPinnedPlace, place), zero_copy); + } else if (platform::is_npu_place(place)) { + SetTensorFromPyArray( + tensor, array, BOOST_GET_CONST(platform::NPUPlace, place), zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace")); - } - if (stop_gradient != -1) { - self->SetOverridedStopGradient(stop_gradient); + "Place should be one of " + "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace")); } - self->SetPersistable(persistable); - self->SetType(framework::proto::VarType::LOD_TENSOR); self->SetDataType(tensor->type()); } static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self, const py::kwargs &kwargs) { VLOG(4) << "Init VarBase from kwargs: "; - PADDLE_ENFORCE_EQ( - kwargs.contains("value"), true, - platform::errors::NotFound( 
- "The kwargs used to create Varbase misses argument: value")); auto persistable = kwargs.contains("persistable") ? kwargs["persistable"].cast() : false; - auto array = kwargs.contains("value") ? kwargs["value"].cast() - : py::array(); auto zero_copy = kwargs.contains("zero_copy") ? kwargs["zero_copy"].cast() : false; auto name = kwargs.contains("name") ? kwargs["name"].cast() : ""; @@ -201,10 +211,18 @@ static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self, ? kwargs["stop_gradient"].cast() : -1; auto default_place = imperative::GetCurrentTracer()->ExpectedPlace(); - auto place = kwargs.contains("place") ? PyObjectToPlace(kwargs["place"]) - : default_place; - InitTensorForVarBase(self, array, place, persistable, zero_copy, name, - stop_gradient); + + if (kwargs.contains("value")) { + auto array = kwargs["value"].cast(); + // place is only used when array is given, otherwise, it is meaningless and + // ignored + auto place = kwargs.contains("place") ? PyObjectToPlace(kwargs["place"]) + : default_place; + InitVarBaseAndTensor(self, array, place, name, persistable, zero_copy, + stop_gradient); + } else { + InitVarBaseOnly(self, name, persistable, stop_gradient); + } } template @@ -239,11 +257,11 @@ static void InitVarBaseFromNumpyWithArgDefault(imperative::VarBase *self, const py::array &array) { auto place = imperative::GetCurrentTracer()->ExpectedPlace(); VLOG(4) << "Init VarBase from numpy at " << place; - InitTensorForVarBase(self, array, place); + InitVarBaseAndTensor(self, array, place, ""); } static void InitVarBaseFromTensorWithArgDefault( - imperative::VarBase *self, const framework::LoDTensor &tensor) { + imperative::VarBase *self, const framework::Tensor &tensor) { VLOG(4) << "Init VarBase"; auto place = imperative::GetCurrentTracer()->ExpectedPlace(); new (self) imperative::VarBase( @@ -397,37 +415,45 @@ static int _PySlice_GetIndices(PySliceObject *r, Py_ssize_t length, return 0; } -static void ParseIndexingSlice(framework::LoDTensor 
*tensor, PyObject *_index, - std::vector *slice_axes, - std::vector *slice_starts, - std::vector *slice_ends, - std::vector *slice_strides, - std::vector *decrease_axis, - std::vector *infer_flags) { - // We allow indexing by Integers, Slices, and tuples of those - // types. - // Ellipsis and None are not supported yet. +static void ParseIndexingSlice( + framework::LoDTensor *tensor, PyObject *_index, + std::vector *slice_axes, std::vector *slice_starts, + std::vector *slice_ends, std::vector *slice_strides, + std::vector *decrease_axis, std::vector *none_axes, + std::vector *infer_flags, std::vector *list_select_idxs, + bool *list_select_flag) { + // We allow indexing by Integers, Slices, Ellipsis, None, tuples of those + // types, and list of Bool and Integers. // wrap to tuple + + // NOTE(zhiqiu): PyTuple_Pack increases refcount. PyObject *index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index; + DEFINE_PADDLE_SCOPE_GUARD([index, _index]() { + if (!PyTuple_Check(_index)) { + Py_DECREF(index); + VLOG(4) << "Call Py_DECREF"; + } + }); PADDLE_ENFORCE_EQ( tensor->IsInitialized(), true, platform::errors::InvalidArgument("tensor has not been initialized")); const auto &shape = tensor->dims(); const int rank = shape.size(); const int size = PyTuple_GET_SIZE(index); - PADDLE_ENFORCE_EQ( - size <= rank, true, - platform::errors::InvalidArgument( - "too many indices (%d) for tensor of dimension %d", size, rank)); + + // specified_dims is the number of dimensions which indexed by Interger, + // Slices. 
+ int specified_dims = 0; for (int dim = 0; dim < size; ++dim) { PyObject *slice_item = PyTuple_GetItem(index, dim); - PADDLE_ENFORCE_EQ(PyCheckInteger(slice_item) || PySlice_Check(slice_item), - true, - platform::errors::InvalidArgument( - "Currently, VarBase.__getitem__() only allows " - "indexing by Integers, Slices, and tuples of " - "these types, but received %s in %dth slice item", - std::string(Py_TYPE(slice_item)->tp_name), dim + 1)); + if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) { + specified_dims++; + } + } + + for (int i = 0, dim = 0; i < size; ++i) { + PyObject *slice_item = PyTuple_GetItem(index, i); + infer_flags->push_back(1); int dim_len = shape[dim]; if (PyCheckInteger(slice_item)) { @@ -450,7 +476,8 @@ static void ParseIndexingSlice(framework::LoDTensor *tensor, PyObject *_index, slice_ends->push_back(start + 1); slice_strides->push_back(1); decrease_axis->push_back(dim); - } else { + dim++; + } else if (PySlice_Check(slice_item)) { // slice item Py_ssize_t start, end, step; PySliceObject *p = reinterpret_cast(slice_item); @@ -458,15 +485,137 @@ static void ParseIndexingSlice(framework::LoDTensor *tensor, PyObject *_index, // :: or : or 0:dim_len:1 if (start == 0 && end == dim_len && step == 1) { + dim++; continue; } slice_axes->push_back(dim); slice_starts->push_back(start); slice_ends->push_back(end); slice_strides->push_back(step); + dim++; + } else if (slice_item == Py_Ellipsis) { + dim += rank - specified_dims; + } else if (slice_item == Py_None) { + none_axes->push_back(dim); + } else if (PyList_Check(slice_item)) { + *list_select_flag = true; + PADDLE_ENFORCE_EQ( + size, 1, + platform::errors::InvalidArgument( + "When index contains a list, its length is excepted to 1, " + "but received %d", + size)); + bool all_bool = true; + int list_size = PyList_GET_SIZE(slice_item); + for (int j = 0; j < list_size; ++j) { + PyObject *list_item = PyList_GetItem(slice_item, j); + if (PyCheckInteger(list_item)) { + all_bool = false; + } 
else if (!PyBool_Check(list_item)) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support int or bool in index list.")); + } + } + if (all_bool) { + PADDLE_ENFORCE_EQ( + list_size, shape[0], + platform::errors::InvalidArgument( + "The dimension of bool index doesn't match indexed array along " + "dimension 0, the target dimension is %d, but received %d.", + shape[0], list_size)); + + for (int j = 0; j < list_size; ++j) { + PyObject *list_item = PyList_GetItem(slice_item, j); + if (list_item == Py_True) { + list_select_idxs->push_back(j); + } + } + } else { + for (int j = 0; j < list_size; ++j) { + PyObject *list_item = PyList_GetItem(slice_item, j); + if (PyCheckInteger(list_item)) { + list_select_idxs->push_back( + static_cast(PyLong_AsLong(list_item))); + } else if (list_item == Py_True) { + list_select_idxs->push_back(1); + } else { + list_select_idxs->push_back(0); + } + } + } + + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, VarBase.__getitem__() only allows indexing " + "by Integers, Slices, Ellipsis, None, tuples of these types " + "and list of Bool and Integers, but received " + "%s in %dth slice item", + std::string(Py_TYPE(slice_item)->tp_name), i + 1)); } } - if (!PyTuple_Check(_index)) Py_DecRef(index); + + // valid_index is the number of dimensions exclude None index + const int valid_indexs = size - none_axes->size(); + PADDLE_ENFORCE_EQ(valid_indexs <= rank, true, + platform::errors::InvalidArgument( + "Too many indices (%d) for tensor of dimension %d.", + valid_indexs, rank)); +} + +template +static void VarBaseCopy(std::shared_ptr &src, // NOLINT + imperative::VarBase &dst, // NOLINT + const P &dst_device, const bool blocking) { + if (dst.SharedVar()->IsEmpty()) { + VLOG(3) << "deep copy Variable from " << src->Name() << " to " + << dst.Name(); + dst.SetPersistable(src->Persistable()); + dst.SetDataType(src->DataType()); + dst.SetType(src->Type()); + dst.SetOverridedStopGradient(src->OverridedStopGradient()); 
+ if (!src->SharedVar()->IsEmpty()) { + if (src->Var().IsType()) { + auto &src_tensor = src->Var().Get(); + auto *dst_tensor = dst.MutableVar()->GetMutable(); + dst_tensor->set_lod(src_tensor.lod()); + framework::TensorCopy(src_tensor, dst_device, dst_tensor); + if (blocking) { + platform::DeviceContextPool::Instance().Get(dst_device)->Wait(); + auto src_device = src_tensor.place(); + if (!(src_device == dst_device)) { + platform::DeviceContextPool::Instance().Get(src_device)->Wait(); + } + } + } else if (src->Var().IsType()) { + auto &src_selected_rows = src->Var().Get(); + auto *dst_selected_rows = + dst.MutableVar()->GetMutable(); + dst_selected_rows->set_height(src_selected_rows.height()); + dst_selected_rows->set_rows(src_selected_rows.rows()); + framework::TensorCopy(src_selected_rows.value(), dst_device, + dst_selected_rows->mutable_value()); + if (blocking) { + platform::DeviceContextPool::Instance().Get(dst_device)->Wait(); + auto src_device = src_selected_rows.value().place(); + if (!(src_device == dst_device)) { + platform::DeviceContextPool::Instance().Get(src_device)->Wait(); + } + } + } + + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(src, dst_device); + } + + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The source Tensor(%s) can not copy when it is empty.", src->Name())); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The destion Tensor(%s) can not copy when it is not empty.", + dst.Name())); + } } // Bind Methods @@ -611,9 +760,10 @@ void BindImperative(py::module *m_ptr) { imperative::SetCurrentTracer(tracer); }); - py::class_>( - m, "VarBase", R"DOC()DOC") - .def_static("_alive_vars", &imperative::VarBase::AliveVarNames) + py::class_> varbase( + m, "VarBase", R"DOC()DOC"); + g_varbase_pytype = (PyTypeObject *)varbase.ptr(); // NOLINT + varbase.def_static("_alive_vars", &imperative::VarBase::AliveVarNames) .def("__init__", [](imperative::VarBase &self) { std::string name = @@ -659,17 
+809,31 @@ void BindImperative(py::module *m_ptr) { py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("zero_copy") = false, py::arg("name") = "", py::arg("stop_gradient") = -1) + .def("__init__", &InitVarBaseFromNumpyWithArg, + py::arg("value"), py::arg("place"), py::arg("persistable") = false, + py::arg("zero_copy") = false, py::arg("name") = "", + py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value")) .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor")) .def("__init__", &InitVarBaseFromNumpyWithKwargs) - .def("__setitem__", + .def("__setitem_varbase__", [](std::shared_ptr &self, py::handle _index, py::object &value_obj) { + VLOG(4) << "Call __setitem_varbase__"; + auto self_tensor = self->MutableVar()->GetMutable(); + // NOTE(zhiqiu): PyTuple_Pack increases refcount while PyTuple_New + // https://github.com/python/cpython/blob/24b63c695ae0a95b06379eaadace66735abac1e2/Objects/tupleobject.c#L251 PyObject *index_ptr = !PyTuple_Check(_index.ptr()) ? PyTuple_Pack(1, _index.ptr()) : _index.ptr(); + DEFINE_PADDLE_SCOPE_GUARD([index_ptr, &_index]() { + if (!PyTuple_Check(_index.ptr())) { + Py_DECREF(index_ptr); + VLOG(4) << "Call Py_DECREF"; + } + }); // 1. Check argumnets // 1.1 Check whether value obj is a tensor. bool value_is_tensor = true; @@ -680,11 +844,24 @@ void BindImperative(py::module *m_ptr) { value_is_tensor = false; } + auto is_tensor = [](py::handle var) { + if (!var.ptr() || var.ptr() == Py_None) { + return false; + } + try { + py::cast>(var); + return true; + } catch (py::cast_error &) { + return false; + } + }; + // 1.2 Check whether _index can be parsed. 
const int size = PyTuple_GET_SIZE(index_ptr); for (int dim = 0; dim < size; ++dim) { PyObject *slice_item = PyTuple_GetItem(index_ptr, dim); - if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item))) { + if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item) || + slice_item == Py_Ellipsis || slice_item == Py_None)) { parse_index = false; break; } @@ -696,20 +873,32 @@ void BindImperative(py::module *m_ptr) { // copys data to cpu place, which reduces performance. if (parse_index && value_is_tensor) { std::vector axes, starts, ends, steps, decrease_axes, - infer_flags; + none_axes, infer_flags, list_select_idxs; + // if index is a list, list_select_flag will be true + bool list_select_flag; ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends, - &steps, &decrease_axes, &infer_flags); + &steps, &decrease_axes, &none_axes, + &infer_flags, &list_select_idxs, + &list_select_flag); framework::AttributeMap attrs = { {"axes", axes}, {"starts", starts}, {"ends", ends}, {"steps", steps}, - {"decrease_axes", decrease_axes}}; + {"decrease_axes", decrease_axes}, + {"none_axes", none_axes}}; imperative::NameVarBaseMap ins = {{"Input", {self}}}; imperative::NameVarBaseMap outs = {{"Out", {self}}}; + PADDLE_ENFORCE_EQ( + self->IsLeaf() && !self->OverridedStopGradient(), false, + platform::errors::InvalidArgument( + "Leaf Tensor (%s) that doesn't stop gradient can't use " + "inplace strategy.", + self->Name())); + auto value_tensor = value_obj.cast>(); ins.insert({"ValueTensor", {value_tensor}}); @@ -718,24 +907,48 @@ void BindImperative(py::module *m_ptr) { { // Release gil and do tracing py::gil_scoped_release release; - tracer->TraceOp("set_value", ins, outs, std::move(attrs)); + tracer->TraceOp("set_value", ins, outs, std::move(attrs), + {{"Input", "Out"}}); } } else { auto self_numpy = TensorToPyArray(*self_tensor); + VLOG(4) << "parse_index is false"; if (value_is_tensor) { + VLOG(4) << "value is tensor"; auto value = value_obj.cast>(); auto 
value_tensor = value->MutableVar()->GetMutable(); auto value_numpy = TensorToPyArray(*value_tensor); - - self_numpy[_index] = value_numpy; + if (is_tensor(_index)) { + VLOG(4) << "index is tensor"; + auto index_var = + py::cast>(_index); + auto index_tensor = index_var->MutableVar() + ->GetMutable(); + auto index_numpy = TensorToPyArray(*index_tensor); + self_numpy[index_numpy] = value_numpy; + } else { + VLOG(4) << "index is not tensor"; + self_numpy[_index] = value_numpy; + } SetTensorFromPyArray(self_tensor, self_numpy, self_tensor->place(), true); } else { - auto value_numpy = value_obj; - self_numpy[_index] = value_numpy; + VLOG(4) << "value is not tensor"; + if (is_tensor(_index)) { + VLOG(4) << "index is tensor"; + auto index_var = + py::cast>(_index); + auto index_tensor = index_var->MutableVar() + ->GetMutable(); + auto index_numpy = TensorToPyArray(*index_tensor); + self_numpy[index_numpy] = value_obj; + } else { + VLOG(4) << "index is not tensor"; + self_numpy[_index] = value_obj; + } SetTensorFromPyArray(self_tensor, self_numpy, self_tensor->place(), true); } @@ -745,21 +958,31 @@ void BindImperative(py::module *m_ptr) { // inplace operator for the VarBase self. 
self->BumpInplaceVersion(); }) - .def("__getitem__", + .def("_getitem_index_not_tensor", [](std::shared_ptr &self, py::handle _index) { + VLOG(4) << "Call _getitem_index_not_tensor"; std::vector slice_axes, slice_starts, slice_ends, - slice_strides, decrease_axis, infer_flags; + slice_strides, decrease_axis, none_axes, infer_flags, + list_select_idxs; + // if index is a list, list_select_flag will be true + bool list_select_flag = false; auto tensor = self->MutableVar()->GetMutable(); ParseIndexingSlice(tensor, _index.ptr(), &slice_axes, &slice_starts, &slice_ends, &slice_strides, - &decrease_axis, &infer_flags); + &decrease_axis, &none_axes, &infer_flags, + &list_select_idxs, &list_select_flag); // release gil and do tracing py::gil_scoped_release release; const auto &tracer = imperative::GetCurrentTracer(); - if (slice_axes.empty()) { - return self; - } else { + + auto out = slice_axes.empty() && !list_select_flag + ? self + : std::shared_ptr( + new imperative::VarBase( + tracer->GenerateUniqueName())); + + if (!slice_axes.empty()) { imperative::NameVarBaseMap ins = {{"Input", {self}}}; framework::AttributeMap attrs = { {"axes", slice_axes}, @@ -767,8 +990,6 @@ void BindImperative(py::module *m_ptr) { {"ends", slice_ends}, {"infer_flags", infer_flags}, {"decrease_axis", decrease_axis}}; - auto out = std::shared_ptr( - new imperative::VarBase(tracer->GenerateUniqueName())); imperative::NameVarBaseMap outs = {{"Out", {out}}}; std::string op_type = "slice"; for (auto stride : slice_strides) { @@ -780,9 +1001,154 @@ void BindImperative(py::module *m_ptr) { } } tracer->TraceOp(op_type, ins, outs, std::move(attrs)); - return out; } + if (!none_axes.empty()) { + // Deal with cases when all axes are decreased. + // After slice, the shape of out is [1], which should have been + // [], but Paddle doesn't support scalar. + // In order to ensure the correctness of the final shape of out, + // one dimension of out needs to be decreased. 
+ // For example: + // # x.shape: (2,3,4) + // out = x[0, 1, 1, None] # out.shape : (1) + if (static_cast(decrease_axis.size()) == + tensor->dims().size()) { + none_axes.pop_back(); + } + if (!none_axes.empty()) { + // Deal with cases that decrease_axes is not empty + // For example: + // # x.shape: (2,3,4) + // out = x[0, 0:2, None] # out.shape : (2, 1, 4) + for (auto &axis : none_axes) { + int len = 0; + for (int da : decrease_axis) { + if (da < axis) { + len++; + } + } + axis -= len; + } + + // Deal with cases that there are more than one + // prefix none index, For example: + // [None, None, :, :, None] + // the none_axes int the return of ParseIndexingSlice is: + // [0, 0, 2 ] + // according to the interface of "unsqueeze2", + // we should convert it to: + // [0, 0, 4 ] + int prefix_zero_cnt = 0; + for (const auto &axis : none_axes) { + if (axis == 0) { + prefix_zero_cnt++; + } else { + break; + } + } + if (prefix_zero_cnt > 0) { + int none_axes_num = static_cast(none_axes.size()); + for (int i = prefix_zero_cnt; i < none_axes_num; ++i) { + none_axes[i] += prefix_zero_cnt; + } + } + + imperative::NameVarBaseMap ins = {{"X", {out}}}; + framework::AttributeMap attrs = {{"axes", none_axes}}; + auto new_out = std::shared_ptr( + new imperative::VarBase(tracer->GenerateUniqueName())); + auto out_xshape = std::shared_ptr( + new imperative::VarBase(tracer->GenerateUniqueName())); + imperative::NameVarBaseMap outs = {{"Out", {new_out}}, + {"XShape", {out_xshape}}}; + tracer->TraceOp("unsqueeze2", ins, outs, std::move(attrs)); + + return new_out; + } + } + + // the index is a list + if (list_select_flag) { + auto select_index = std::shared_ptr( + new imperative::VarBase(tracer->GenerateUniqueName())); + auto *idx_tensor = select_index->MutableVar() + ->GetMutable(); + auto *dev_ctx = platform::DeviceContextPool::Instance().Get( + tracer->ExpectedPlace()); + TensorFromVector(list_select_idxs, *dev_ctx, idx_tensor); + + imperative::NameVarBaseMap ins = {{"X", {self}}, + 
{"Index", {select_index}}}; + imperative::NameVarBaseMap outs = {{"Out", {out}}}; + tracer->TraceOp("index_select", ins, outs, {{"dim", 0}}); + } + + return out; }) + .def( + "_getitem_from_offset", + [](std::shared_ptr &self, const py::args &args) { + const auto &tensor = self->Var().Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor of %s is Empty, please check if it has no data.", + self->Name())); + + const auto &tensor_dims = tensor.dims(); + + std::vector dims(tensor_dims.size()); + std::vector strides(tensor_dims.size()); + + size_t numel = 1; + for (int i = tensor_dims.size() - 1; i >= 0; --i) { + strides[i] = numel; + dims[i] = static_cast(tensor_dims[i]); + numel *= dims[i]; + } + size_t offset = 0; + if (args.empty()) { + PADDLE_ENFORCE_EQ( + numel, 1, + platform::errors::InvalidArgument( + "only one element tensors can be converted to Python " + "scalars when no input coordinates")); + } else if (args.size() == 1) { + offset = args[0].cast(); + PADDLE_ENFORCE_LT( + offset, numel, + platform::errors::InvalidArgument( + "index %d is out of bounds for size %d", offset, numel)); + } else { + PADDLE_ENFORCE_EQ(args.size(), dims.size(), + platform::errors::InvalidArgument( + "incorrect number of indices for Tensor")); + + for (size_t i = 0; i < args.size(); ++i) { + size_t index = args[i].cast(); + PADDLE_ENFORCE_LT( + index, dims[i], + platform::errors::InvalidArgument( + "index %d is out fo bounds for axis %d with size %d", + index, i, dims[i])); + offset += index * strides[i]; + } + } +#define TENSOR_TO_PY_SCALAR(T, proto_type) \ + if (tensor.type() == proto_type) { \ + std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(proto_type); \ + T b = TensorGetElement(tensor, offset); \ + return py::array(py::dtype(py_dtype_str.c_str()), {}, {}, \ + static_cast(&b)); \ + } + + _ForEachDataType_(TENSOR_TO_PY_SCALAR); +#undef TENSOR_TO_PY_SCALAR + PADDLE_THROW(platform::errors::Unimplemented( + 
"Unsupported tensor data type: %s", + framework::DataTypeToString(tensor.type()))); + }, + py::return_value_policy::copy) .def("_inplace_version", [](imperative::VarBase &self) -> uint32_t { const auto &var = self.MutableVar(); @@ -1182,20 +1548,26 @@ void BindImperative(py::module *m_ptr) { )DOC") .def("cuda", - [](const std::shared_ptr &self, int device_id, - bool blocking) { + [](const std::shared_ptr &self, + py::handle &handle, bool blocking) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot copy this Tensor to GPU in CPU version Paddle, " "Please recompile or reinstall Paddle with CUDA support.")); #else int device_count = platform::GetCUDADeviceCount(); - if (device_id == -1) { + int device_id = 0; + if (handle == py::none()) { if (platform::is_gpu_place(self->Place())) { return self; - } else { - device_id = 0; } + } else { + PyObject *py_obj = handle.ptr(); + PADDLE_ENFORCE_EQ( + PyCheckInteger(py_obj), true, + platform::errors::InvalidArgument( + " 'device_id' must be a positive integer")); + device_id = py::cast(handle); } PADDLE_ENFORCE_GE( device_id, 0, @@ -1219,26 +1591,30 @@ void BindImperative(py::module *m_ptr) { } #endif }, - py::arg("device_id") = -1, py::arg("blocking") = true, R"DOC( + py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC( Returns a copy of this Tensor in GPU memory. If this Tensor is already in GPU memory and device_id is default, then no copy is performed and the original Tensor is returned. Args: - device_id(int, optional): The destination GPU device id. Defaults to the current device. + device_id(int, optional): The destination GPU device id. Default: None, means current device. blocking(bool, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. Default: False. Examples: .. 
code-block:: python + # required: gpu import paddle x = paddle.to_tensor(1.0, place=paddle.CPUPlace()) print(x.place) # CPUPlace y = x.cuda() print(y.place) # CUDAPlace(0) + + y = x.cuda(None) + print(y.place) # CUDAPlace(0) y = x.cuda(1) print(y.place) # CUDAPlace(1) @@ -1321,6 +1697,16 @@ void BindImperative(py::module *m_ptr) { return new_var; }, py::return_value_policy::copy) + .def("_copy_to", + [](const std::shared_ptr &self, + const platform::NPUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) .def("_copy_to", [](const std::shared_ptr &self, const platform::Place &place, bool blocking) { @@ -1340,28 +1726,22 @@ void BindImperative(py::module *m_ptr) { &imperative::VarBase::SetOverridedStopGradient) .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property_readonly("shape", - [](imperative::VarBase &self) { - if (self.Var().IsType()) { - return framework::vectorize( - self.Var() - .Get() - .dims()); - } else if (self.Var() - .IsType< - framework::SelectedRows>()) { - return framework::vectorize( - self.Var() - .Get() - .value() - .dims()); - } else { - VLOG(2) << "It is meaningless to get shape of " - "variable type " - << GetTypeName(self); - return std::vector(); - } - }) + .def_property_readonly( + "shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return framework::vectorize( + self.Var().Get().dims()); + } else if (self.Var().IsType()) { + return framework::vectorize( + self.Var().Get().value().dims()); + } else { + VLOG(2) << "It is meaningless to get shape of " + "variable type " + << GetTypeName(self); + return std::vector(); + } + }) .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. 
@@ -1453,6 +1833,11 @@ void BindImperative(py::module *m_ptr) { self.SetExpectedPlace(*p); VLOG(4) << "Tracer(" << &self << ")" << " set expected place " << *p; + } else if (py::isinstance(obj)) { + auto p = obj.cast(); + self.SetExpectedPlace(*p); + VLOG(4) << "Tracer(" << &self << ")" + << " set expected place " << *p; } else if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); @@ -1461,7 +1846,7 @@ void BindImperative(py::module *m_ptr) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "Incompatible Place Type: supports XPUPlace, CUDAPlace, " - "CPUPlace, " + "CPUPlace, NPUPlace" "and CUDAPinnedPlace, " "but got Unknown Type!")); } @@ -1487,7 +1872,7 @@ void BindImperative(py::module *m_ptr) { allow_ops); imperative::AmpOperators::Instance().GetMutableBlockOps()->swap( block_ops); - VLOG(4) << "AMP operators changed, " + VLOG(5) << "AMP operators changed, " << imperative::AmpOperators::Instance(); }) .def("_get_amp_op_list", @@ -1522,6 +1907,19 @@ void BindImperative(py::module *m_ptr) { std::move(attrs), place, trace_backward); } }) + .def("trace", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::NPUPlace &place, + bool trace_backward) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp(type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, @@ -1574,6 +1972,13 @@ void BindImperative(py::module *m_ptr) { self.nrings_ = nrings; }); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", 
&VarBaseCopy); + m.def( "dygraph_partial_grad", [](const std::vector> &input_targets, @@ -1673,6 +2078,12 @@ void BindImperative(py::module *m_ptr) { const py::args args, const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); + + m.def("pylayer_apply", + [](const platform::NPUPlace &place, const py::object &cls, + const py::args args, const py::kwargs kwargs) { + return imperative::PyLayerApply(place, cls, args, kwargs); + }); } } // namespace pybind diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 64209aee875ba02364481674f796a0b519f771f5..c42a2a5943d11a33b1dc923a9361324c7956fbc7 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -16,16 +16,18 @@ import inspect import numpy as np import warnings import weakref +import sys import paddle from .. import framework from .. import core from .. import unique_name -from ..framework import Variable, Parameter, ParamBase +from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_ from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE +import paddle.utils.deprecated as deprecated class TensorHookRemoveHelper(object): @@ -85,7 +87,7 @@ def monkey_patch_varbase(): """ - # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. + # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery in dygraph only, should not let it getattr(self, attr, None). 
attr_not_need_keys = ['grad'] if isinstance(self, ParamBase): @@ -107,6 +109,8 @@ def monkey_patch_varbase(): if to_parameter or isinstance(self, ParamBase): del attr_kwargs['persistable'] + # NOTE(Aurelius84): All parameters should be placed into global block. + attr_kwargs['block'] = attr_kwargs['block'].program.global_block() static_var = Parameter(**attr_kwargs) else: static_var = Variable(**attr_kwargs) @@ -238,8 +242,17 @@ def monkey_patch_varbase(): "Variable.backward() is only available in DyGraph mode") @framework.dygraph_only + @deprecated( + since="2.1.0", + level=1, + reason="Please use tensor.grad, which returns the tensor value of the gradient." + ) def gradient(self): """ + .. warning:: + This API will be deprecated in the future, it is recommended to use + :code:`x.grad` which returns the tensor value of the gradient. + Get the Gradient of Current Tensor. Returns: @@ -253,7 +266,7 @@ def monkey_patch_varbase(): x = paddle.to_tensor(5., stop_gradient=False) y = paddle.pow(x, 4.0) y.backward() - print("grad of x: {}".format(x.grad)) + print("grad of x: {}".format(x.gradient())) # [500.] """ @@ -337,10 +350,37 @@ def monkey_patch_varbase(): @property def grad(self): """ - The alias of gradient(). - """ + .. warning:: + This API will return the tensor value of the gradient. If you want + to get the numpy value of the gradient, you can use :code:`x.grad.numpy()`. + + Get the Gradient of Current Tensor. + + Returns: + Tensor: the gradient of current Tensor + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor(5., stop_gradient=False) + y = paddle.pow(x, 4.0) + y.backward() + print("grad of x: {}".format(x.grad)) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, [500.]) - return self.gradient() + """ + msg = 'tensor.grad will return the tensor value of the gradient.' \ + ' This is an incompatible upgrade for tensor.grad API. 
' \ + ' It\'s return type changes from numpy.ndarray in version 2.0 to paddle.Tensor in version 2.1.0. ' \ + ' If you want to get the numpy value of the gradient, you can use :code:`x.grad.numpy()`' + warning_msg = "\033[93m\nWarning:\n%s \033[0m" % (msg) + # ensure ANSI escape sequences print correctly in cmd and powershell + if sys.platform.lower() == 'win32': + warning_msg = "\nWarning:\n%s " % (msg) + warnings.warn(warning_msg) + return self._grad_ivar() def clear_grad(self): """ @@ -348,6 +388,49 @@ def monkey_patch_varbase(): """ self.clear_gradient() + def item(self, *args): + """ + Convert one element Tensor to a Python scalar. + + Args: + *args(int): The input coordinates. If it's single int, the data in the corresponding order of flattened Tensor will be returned. + Default: None, and it must be in the case where Tensor has only one element. + + Returns(Python scalar): A Python scalar, whose dtype is corresponds to the dtype of Tensor. + + Raises: + ValueError: If the Tensor has more than one element, there must be coordinates. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor(1) + print(x.item()) #1 + print(type(x.item())) # + + x = paddle.to_tensor(1.0) + print(x.item()) #1.0 + print(type(x.item())) # + + x = paddle.to_tensor(True) + print(x.item()) #True + print(type(x.item())) # + + x = paddle.to_tensor(1+1j) + print(x.item()) #(1+1j) + print(type(x.item())) # + + x = paddle.to_tensor([[1.1, 2.2, 3.3]]) + print(x.item(2)) #3.3 + print(x.item(0, 2)) #3.3 + + x = paddle.to_tensor([1, 2]) + x.item() #ValueError: only one element tensor can be converted to Python scalar when no input coordinates. + """ + return self._getitem_from_offset(*args).item() + @property def inplace_version(self): """ @@ -435,7 +518,95 @@ def monkey_patch_varbase(): return self.__nonzero__() def __array__(self, dtype=None): - return self.numpy().astype(dtype) + """ + Returns a numpy array shows the value of current Tensor. 
+ + Returns: + ndarray: The numpy value of current Tensor. + + Returns type: + ndarray: dtype is same as current Tensor + + Examples: + .. code-block:: python + + import paddle + import numpy as np + x = paddle.randn([2, 2]) + x_array = np.array(x) + + print(type(x_array)) # + print(x_array.shape) #(2, 2) + """ + array = self.numpy() + if dtype: + array = array.astype(dtype) + return array + + def contain_tensor(item): + if not isinstance(item, tuple): + item = [item] + + for slice_item in item: + if isinstance(slice_item, slice): + if isinstance(slice_item.start, Variable) \ + or isinstance(slice_item.stop, Variable) \ + or isinstance(slice_item.step, Variable): + return True + else: + if isinstance(slice_item, Variable): + return True + return False + + def __getitem__(self, item): + def is_list_tuple(index, contain_type): + def _is_list_tuple(item): + if not (isinstance(item, (list, tuple)) or + type(item) == contain_type): + return False + if isinstance(item, (tuple, list)): + for s in item: + if not _is_list_tuple(s): + return False + return True + + if not isinstance(index, (tuple, list)): + return False + for s in index: + if not _is_list_tuple(s): + return False + return True + + if contain_tensor(item) or is_list_tuple(item, int): + # 1. Call _getitem_impl_ when item contains tensor. + # Why not call a c++ function ? Because item can't be parsed when it contains tensor. + return _getitem_impl_(self, item) + + else: + # 2. Call c++ func getitem_index_not_tensor to speedup. + return self._getitem_index_not_tensor(item) + + def __setitem__(self, item, value): + def contain_tensor_or_list(item): + if not isinstance(item, tuple): + item = [item] + + for slice_item in item: + if isinstance(slice_item, list): + return True + elif isinstance(slice_item, Variable): + return True + + return False + + if contain_tensor_or_list(item): + # To reuse code with static graph, + # Call _setitem_impl_ when item contains tensor or list. 
+ return _setitem_impl_(self, item, value) + + else: + # Call c++ func __setitem_varbase__ to speedup. + return self.__setitem_varbase__(item, value) for method_name, method in ( ("__bool__", __bool__), ("__nonzero__", __nonzero__), @@ -445,7 +616,9 @@ def monkey_patch_varbase(): ("gradient", gradient), ("register_hook", register_hook), ("__str__", __str__), ("__repr__", __str__), ("__deepcopy__", __deepcopy__), ("__module__", "paddle"), - ("__name__", "Tensor"), ("__array__", __array__)): + ("__name__", "Tensor"), ("__array__", __array__), + ("__getitem__", __getitem__), ("item", item), + ("__setitem__", __setitem__)): setattr(core.VarBase, method_name, method) # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class. diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 0885891cdbe02747c8babbdbe748f21b30c34598..21f506d03ce68e7eb47d185c06aeab5f4ba4cabd 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -20,6 +20,8 @@ import unittest import numpy as np import paddle +from paddle.fluid.layer_helper import LayerHelper +from functools import reduce class TestSetValueBase(unittest.TestCase): @@ -333,6 +335,134 @@ class TestSetValueItemTensor6(TestSetValueApi): self.data[2:0:-1, 0:2, ::-1] = self.value +# 1.5 item is None +class TestSetValueItemNone1(TestSetValueApi): + def _call_setitem(self, x): + x[None] = self.value + + def _get_answer(self): + self.data[None] = self.value + + +class TestSetValueItemNone2(TestSetValueApi): + def _call_setitem(self, x): + x[0, None, 1] = self.value + + def _get_answer(self): + self.data[0, None, 1] = self.value + + +class TestSetValueItemNone3(TestSetValueApi): + def _call_setitem(self, x): + x[:, None, None, 1] = self.value + + def _get_answer(self): + self.data[:, None, None, 1] = self.value + + +class TestSetValueItemNone4(TestSetValueApi): + def 
_call_setitem(self, x): + x[0, 0, None, 1] = self.value + + def _get_answer(self): + self.data[0, 0, None, 1] = self.value + + +class TestSetValueItemNone5(TestSetValueApi): + def _call_setitem(self, x): + x[0, None, 0, None, 1] = self.value + + def _get_answer(self): + self.data[0, None, 0, None, 1] = self.value + + +class TestSetValueItemNone6(TestSetValueApi): + def _call_setitem(self, x): + x[None, 0, 0, None, 0] = self.value + + def _get_answer(self): + self.data[None, 0, 0, None, 0] = self.value + + +class TestSetValueItemNone7(TestSetValueApi): + def _call_setitem(self, x): + x[:, None, 1] = np.zeros(self.shape)[:, None, 0] + + def _get_answer(self): + self.data[:, None, 1] = np.zeros(self.shape)[:, None, 0] + + +class TestSetValueItemNone8(TestSetValueApi): + def _call_setitem(self, x): + x[:, 1, None] = np.zeros(self.shape)[:, 0, None] + + def _get_answer(self): + self.data[:, 1, None] = np.zeros(self.shape)[:, 0, None] + + +class TestSetValueItemNone9(TestSetValueApi): + def _call_setitem(self, x): + x[None, :, 1, ..., None] = np.zeros(self.shape)[0, 0, :, None] + + def _get_answer(self): + self.data[None, :, 1, ..., None] = np.zeros(self.shape)[0, 0, :, None] + + +# 1.5 item is list or Tensor of bol +class TestSetValueItemBool1(TestSetValueApi): + def _call_setitem(self, x): + x[[True, False]] = self.value + + def _get_answer(self): + self.data[[True, False]] = self.value + + +class TestSetValueItemBool2(TestSetValueApi): + def _call_setitem(self, x): + x[[False, False]] = self.value + + def _get_answer(self): + self.data[[False, False]] = self.value + + +class TestSetValueItemBool3(TestSetValueApi): + def _call_setitem(self, x): + x[[False, True]] = np.zeros(self.shape[2]) + + def _get_answer(self): + self.data[[False, True]] = np.zeros(self.shape[2]) + + +class TestSetValueItemBool4(TestSetValueApi): + def _call_setitem(self, x): + idx = paddle.assign(np.array([False, True])) + x[idx] = np.zeros(self.shape[2]) + + def _get_answer(self): + 
self.data[np.array([False, True])] = np.zeros(self.shape[2]) + + +class TestSetValueItemBool5(TestSetValueApi): + def _call_setitem(self, x): + idx = paddle.assign( + np.array([[False, True, False], [True, True, False]])) + x[idx] = self.value + + def _get_answer(self): + self.data[np.array([[False, True, False], [True, True, False] + ])] = self.value + + +class TestSetValueItemBool6(TestSetValueApi): + def _call_setitem(self, x): + x[0, ...] = 0 + x[x > 0] = self.value + + def _get_answer(self): + self.data[0, ...] = 0 + self.data[self.data > 0] = self.value + + # 2. Test different type of value: int, float, numpy.ndarray, Tensor # 2.1 value is int32, int64, float32, float64, bool @@ -755,6 +885,21 @@ class TestError(TestSetValueBase): one = paddle.ones([1]) x[::one] = self.value + def _bool_list_error(self): + with self.assertRaises(TypeError): + x = paddle.ones(shape=self.shape, dtype=self.dtype) + x[[True, False, 0]] = 0 + + with self.assertRaises(IndexError): + x = paddle.ones(shape=self.shape, dtype=self.dtype) + x[[True, False], [True, False]] = 0 + + def _bool_tensor_error(self): + with self.assertRaises(IndexError): + x = paddle.ones(shape=self.shape, dtype=self.dtype) + idx = paddle.assign([True, False, True]) + x[idx] = 0 + def _broadcast_mismatch(self): program = paddle.static.Program() with paddle.static.program_guard(program): @@ -762,8 +907,7 @@ class TestError(TestSetValueBase): value = np.array([3, 4, 5, 6, 7]) x[0] = value exe = paddle.static.Executor(paddle.CPUPlace()) - with self.assertRaisesRegexp(ValueError, - "Broadcast dimension mismatch."): + with self.assertRaises(ValueError): exe.run(program) def test_error(self): @@ -772,8 +916,391 @@ class TestError(TestSetValueBase): self._value_type_error() self._dtype_error() self._step_error() + self._bool_list_error() + self._bool_tensor_error() self._broadcast_mismatch() +# 5. 
Test backward + + +class Model(paddle.nn.Layer): + def __init__(self): + super(Model, self).__init__() + self.conv = paddle.nn.Conv2D(12, 12, 3) + + def forward(self, x, y): + x = self.conv(x) + y = self.conv(y) + var = y.flatten() + + x[0, :, 0, 0] = var + loss = paddle.mean(x) + return loss, var, x + + +class TestBackward(unittest.TestCase): + def test_static(self): + paddle.enable_static() + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + x_np = np.random.random(size=(4, 4)).astype('float32') + y_np = np.random.random(size=(4, 4)).astype('float32') + label_np = np.random.randint(2, size=(4, 1)).astype('int64') + + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data(name="x", shape=[4, 4], dtype='float32') + y = paddle.static.data(name="y", shape=[4, 4], dtype='float32') + + label = paddle.static.data( + name="label", shape=[4, 1], dtype='int64') + + z = paddle.add(x, y) + var = y[0, :] + z[0, :] = var + + prediction = paddle.static.nn.fc(x=z, size=2, activation='softmax') + + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=label) + loss = paddle.mean(cost) + sgd = paddle.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + exe = paddle.static.Executor(paddle.CPUPlace()) + exe.run(startup_program) + + var_grad, z_grad = exe.run( + main_program, + feed={"x": x_np, + "y": y_np, + "label": label_np}, + fetch_list=[var.name + "@GRAD", z.name + "@GRAD"]) + + self.assertTrue((var_grad == z_grad[0, :]).all()) + + def test_dynamic(self): + paddle.disable_static() + model = Model() + x = paddle.ones([1, 12, 3, 3]).astype("float32") + y = paddle.ones([1, 12, 3, 3]).astype("float32") + loss, var, x = model(x, y) + loss.backward() + + self.assertTrue(var.grad.shape == x.grad[0, :, 0, 0].shape) + # + self.assertTrue((0 == x.grad[0, :, 0, 0]).all()) + + +class TestGradientTruncated(unittest.TestCase): + def test_consistent_with_competitor(self): + paddle.disable_static() + + 
def set_value(t, value): + a = t * t + a[0, 1] = value + y = a * a + return y.sum() + + # case 1 + array = np.arange( + 1, 1 + 2 * 3 * 4, dtype="float32").reshape([1, 2, 1, 3, 1, 4]) + value = np.arange(100, 104, dtype="float32").reshape(1, 4) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value(inps, value) + loss.backward() + + value_grad = np.array([[600., 606., 612., 618.]]) + input_grad = np.array( + [[[[[[4., 32., 108., 256.]], [[500., 864., 1372., 2048.]], + [[2916., 4000., 5324., 6912.]]]], + [[[[0., 0., 0., 0.]], [[0., 0., 0., 0.]], [[0., 0., 0., 0.]]]]]]) + self.assertTrue( + np.array_equal(inps.grad.numpy(), input_grad), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps.grad.numpy())) + self.assertTrue( + np.array_equal(value.grad.numpy(), value_grad), + msg="The gradient of input should be \n{},\n but reveived {}". + format(value_grad, value.grad.numpy())) + + # case 2 + array = np.arange(1, 2 * 3 * 4 + 1, dtype="float32").reshape([4, 2, 3]) + value = np.arange(100, 100 + 1, dtype="float32") + + inps2 = paddle.to_tensor(array, stop_gradient=False) + value2 = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value(inps2, value2) + loss.backward() + + value_grad2 = np.array([600.]) + input_grad2 = np.array( + [[[4., 32., 108.], [0., 0., 0.]], [[1372., 2048., 2916.], + [4000., 5324., 6912.]], + [[8788., 10976., 13500.], [16384., 19652., 23328.]], + [[27436., 32000., 37044.], [42592., 48668., 55296.]]]) + self.assertTrue( + np.array_equal(inps2.grad.numpy(), input_grad2), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps2.grad.numpy())) + self.assertTrue( + np.array_equal(value2.grad.numpy(), value_grad2), + msg="The gradient of input should be \n{},\n but reveived {}". 
+ format(value_grad, value2.grad.numpy())) + + # case 3 + def set_value3(t, value): + a = t * t + a[0, :, 0, :] = value + y = a * a + return y.sum() + + array = np.arange( + 1, 1 + 2 * 3 * 4, dtype="float32").reshape([4, 3, 1, 1, 2, 1]) + value = np.arange(100, 100 + 2, dtype="float32").reshape(1, 2, 1) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value3(inps, value) + loss.backward() + + value_grad = np.array([[[600.], [606.]]]) + input_grad = np.array( + [[[[[[0.], [0.]]]], [[[[0.], [0.]]]], [[[[0.], [0.]]]]], + [[[[[1372.], [2048.]]]], [[[[2916.], [4000.]]]], + [[[[5324.], [6912.]]]]], [[[[[8788.], [10976.]]]], [[[[13500.], + [16384.]]]], + [[[[19652.], [23328.]]]]], + [[[[[27436.], [32000.]]]], [[[[37044.], [42592.]]]], + [[[[48668.], [55296.]]]]]]) + self.assertTrue( + np.array_equal(inps.grad.numpy(), input_grad), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps.grad.numpy())) + self.assertTrue( + np.array_equal(value.grad.numpy(), value_grad), + msg="The gradient of input should be \n{},\n but reveived {}". 
+ format(value_grad, value.grad.numpy())) + + #case 4: step >0 + def set_value4(t, value): + a = t * t + a[0, :, 0, ::3] = value + y = a * a + return y.sum() + + array = np.arange( + 1, 1 + 2 * 3 * 4, dtype="float32").reshape([2, 3, 1, 4, 1]) + value = np.arange(100, 100 + 2, dtype="float32").reshape(1, 2, 1) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value4(inps, value) + loss.backward() + + value_grad = np.array([[[600.], [606.]]]) + input_grad = np.array([[[[[0.], [32.], [108.], + [0.]]], [[[0.], [864.], [1372.], [0.]]], + [[[0.], [4000.], [5324.], [0.]]]], + [[[[8788.], [10976.], [13500.], [16384.]]], + [[[19652.], [23328.], [27436.], [32000.]]], + [[[37044.], [42592.], [48668.], [55296.]]]]]) + self.assertTrue( + np.array_equal(inps.grad.numpy(), input_grad), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps.grad.numpy())) + self.assertTrue( + np.array_equal(value.grad.numpy(), value_grad), + msg="The gradient of input should be \n{},\n but reveived {}". 
+ format(value_grad, value.grad.numpy())) + + # case 5:a[0].shape==value.shape + def set_value5(t, value): + a = t * t + a[0] = value + y = a * a + return y.sum() + + array = np.arange(1, 1 + 2 * 3 * 4, dtype="float32").reshape([2, 3, 4]) + value = np.arange(100, 100 + 12, dtype="float32").reshape(3, 4) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value5(inps, value) + loss.backward() + + value_grad = np.array([[200., 202., 204., 206.], + [208., 210., 212., 214.], + [216., 218., 220., 222.]]) + input_grad = np.array([[[0., 0., 0., 0.], [0., 0., 0., 0.], + [0., 0., 0., 0.]], + [[8788., 10976., 13500., 16384.], + [19652., 23328., 27436., 32000.], + [37044., 42592., 48668., 55296.]]]) + self.assertTrue( + np.array_equal(inps.grad.numpy(), input_grad), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps.grad.numpy())) + self.assertTrue( + np.array_equal(value.grad.numpy(), value_grad), + msg="The gradient of input should be \n{},\n but reveived {}". 
+ format(value_grad, value.grad.numpy())) + + def test_static_graph(self): + paddle.enable_static() + + to_string = lambda x, i, : x + '_' + str(i) + numel = lambda input_shape: reduce(lambda x, y: x * y, input_shape) + + def op1(x): + value = paddle.fluid.layers.fill_constant([1], "float32", 1) + # test stop_gradient + value.stop_gradient = True + x.stop_gradient = False + start = paddle.fluid.layers.fill_constant( + [1], "int32", 5, force_cpu=True) + end = paddle.fluid.layers.fill_constant( + [1], "int32", 0, force_cpu=True) + step = paddle.fluid.layers.fill_constant( + [1], "int32", -2, force_cpu=True) + + inputs = { + 'Input': x, + 'ValueTensor': value, + 'StartsTensorList': [start, ], + 'EndsTensorList': [end, ], + 'StepsTensorList': [step, ] + } + + helper = LayerHelper("set_value") + y = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': y}, + attrs={'axes': [0]}) + + return y, value + + def op2(x): + value = paddle.fluid.layers.fill_constant([1, 3, 2], "float32", 1) + # test stop_gradient + value.stop_gradient = False + x.stop_gradient = False + attrs = { + 'axes': [0], + 'starts': [6], + 'ends': [0], + 'steps': [-4], + 'decrease_axes': [], + 'none_axes': [], + 'dtype': paddle.float32 + } + inputs = {'Input': x, 'ValueTensor': value} + + helper = LayerHelper("set_value") + y = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': y}, + attrs=attrs) + + return y, value + + def op3(x): + value = paddle.fluid.layers.fill_constant([1], "float32", 1) + x.stop_gradient = True + value.stop_gradient = False + start = paddle.fluid.layers.fill_constant( + [1], "int32", 0, force_cpu=True) + end = paddle.fluid.layers.fill_constant( + [1], "int32", 5, force_cpu=True) + step = paddle.fluid.layers.fill_constant( + [1], "int32", 3, force_cpu=True) + + inputs = { + 'Input': x, + 'ValueTensor': value, + 
'StartsTensorList': [start, ], + 'EndsTensorList': [end, ], + 'StepsTensorList': [step, ] + } + + helper = LayerHelper("set_value") + y = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': y}, + attrs={'axes': [0]}) + + return y, value + + def set_value(array, i, op): + name_x = to_string('x', i) + x = paddle.static.data( + name=name_x, shape=array.shape, dtype='float32') + + # set_value_op in __get/setitem__ is an inplace operation. + # When `input.stop_gradient = True` and `value.stop_gradient = False`, + # set_value_grad_op will not be run during backward. + y, value = op(x) + + y2 = y + 1 + loss = paddle.fluid.layers.reduce_sum(y2) + sgd = paddle.optimizer.Adam() + sgd.minimize(loss) + place = paddle.fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else paddle.fluid.CUDAPlace(0) + + prog = paddle.static.default_main_program() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + fetch_list = [] + if not x.stop_gradient: + fetch_list.append(x.grad_name) + if not value.stop_gradient: + fetch_list.append(value.grad_name) + out = exe.run(prog, feed={x.name: array}, fetch_list=fetch_list) + return out + + input_shape = [7, 6, 5, 4, 3, 2] + + array = np.arange( + 0, numel(input_shape), dtype="float32").reshape(input_shape) + + for i in range(len(input_shape)): + program = paddle.static.Program() + with paddle.static.program_guard(program): + out1 = set_value(array, i, op1) + self.assertTrue((out1[0][5:0:-2] == 0).all()) + + if len(array.shape) > 2: + program2 = paddle.static.Program() + with paddle.static.program_guard(program2): + out2 = set_value(array, i, op2) + self.assertTrue((out2[0][6:0:-4] == 0).all()) + + program3 = paddle.static.Program() + with paddle.static.program_guard(program3): + out3 = set_value(array, i, op3) + self.assertTrue((numel(out1[0][0:5:3].shape) == out3[0]).all()) + + array = array[0] + + if 
__name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py new file mode 100644 index 0000000000000000000000000000000000000000..1b9a82ba85f05a92c3f783081b4bbb3570250272 --- /dev/null +++ b/python/paddle/fluid/variable_index.py @@ -0,0 +1,701 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import numpy as np +from . import unique_name +from . 
import core +import paddle + +MAX_INTEGER = 2**31 - 1 + + +def is_list_tuple(index, contain_type): + def _is_list_tuple(item): + if not (isinstance(item, (list, tuple)) or type(item) == contain_type): + return False + if isinstance(item, (tuple, list)): + for s in item: + if not _is_list_tuple(s): + return False + return True + + if not isinstance(index, (tuple, list)): + return False + for s in index: + if not _is_list_tuple(s): + return False + return True + + +def is_one_dim_list(index, contain_type): + if isinstance(index, list): + for i in index: + if not isinstance(i, contain_type): + return False + else: + return False + return True + + +def get_list_index_shape(var_dims, index_dims): + var_dims_size = len(var_dims) + index_dims_size = len(index_dims) + + out_dims_size = var_dims_size - index_dims[0] + index_dims_size - 1 + + out_dims_shape = [1] * out_dims_size + + out_dims_shape[:index_dims_size - 1] = index_dims[1:] + + out_dims_shape[index_dims_size - 1:] = var_dims[index_dims[0]:] + return out_dims_shape + + +class SliceInfo: + def __init__(self): + self.pre_shape = None + self.indexes = [] + + def update(self, index): + if is_list_tuple(index, int) or isinstance(index, ( + paddle.fluid.Variable, np.ndarray)): + # convert index to Tensor + if not isinstance(index, paddle.fluid.Variable): + index = paddle.assign(index) + + self.indexes.append(index) + + if self.pre_shape is None: + self.pre_shape = index.shape + else: + if self.pre_shape != index.shape: + # broadcast + cur_shape = paddle.broadcast_shape(self.pre_shape, + index.shape) + for i in range(len(self.indexes)): + self.indexes[i] = paddle.broadcast_to(self.indexes[i], + cur_shape) + self.pre_shape = self.indexes[-1].shape + else: + raise ValueError( + "Index should be list/tuple of int or Tensor, but received {}.". 
+ format(index)) + + def shape_stride(self, shape): + s = [1] * len(shape) + for i in range(len(shape) - 2, -1, -1): + s[i] = shape[i + 1] * s[i + 1] + + return s + + def numel(self, shape): + return reduce(lambda x, y: x * y, shape) + + def get_offset_stride(self, tensor_shape): + for index in self.indexes: + if not isinstance(index, paddle.fluid.Variable): + raise ValueError( + "only support list/tensor index, but received {}.".format( + type(index))) + + if len(self.indexes) <= len(tensor_shape) or len(self.indexes) == 1: + shape = paddle.stack(self.indexes) + axes = list(range(1, len(self.pre_shape) + 1)) + [0, ] + + else: + raise ValueError( + "too many indices for tensor: tensor is {}-dimensional, but {} were indexed". + format(len(tensor_shape), self.pre_shape[0])) + + shape_transpose = paddle.transpose(shape, axes) + return shape_transpose + + def get_item(self, tensor): + shape_transpose = self.get_offset_stride(tensor.shape) + index = paddle.assign(shape_transpose) + return paddle.gather_nd(tensor, index) + + def set_item(self, tensor_origin, value): + + if not isinstance(value, paddle.fluid.Variable): + value = paddle.assign(value) + tensor_type = None + + if tensor_origin.dtype in [ + core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64 + ]: + tensor = tensor_origin + else: + tensor_type = tensor_origin.dtype + tensor = tensor_origin.astype(core.VarDesc.VarType.FP32) + + if value.dtype != tensor.dtype: + value = value.astype(tensor.dtype) + + shape_transpose = self.get_offset_stride(tensor_origin.shape) + index = paddle.assign(shape_transpose) + + gather_tensor_shape = get_list_index_shape( + tensor.shape, [len(self.indexes), ] + list(self.indexes[-1].shape)) + + value_dims_bd = [1, ] * len(gather_tensor_shape) + value_dims_bd[-len(value.shape):] = list(value.shape) + + for i in range(len(gather_tensor_shape)): + if not (value_dims_bd[i] == gather_tensor_shape[i] or + value_dims_bd[i] == 1): + raise ValueError("{} can not broadcast into {}".format( + 
value.shape, gather_tensor_shape)) + + value_broadcast = paddle.broadcast_to(value, gather_tensor_shape) + + value_1d = value_broadcast.reshape([-1] + gather_tensor_shape[len( + index.shape) - 1:]) + + index_1d = index.reshape([-1, index.shape[-1]]) + + tensor_stride = paddle.assign( + self.shape_stride(tensor.shape[:index.shape[-1]])) + inds = [] + for i in range(index_1d.shape[0]): + temp = (index_1d[i] * tensor_stride).sum() + inds.append(temp) + index_1d = paddle.stack(inds).reshape([-1]) + t_reshape = tensor.reshape([-1] + list(tensor.shape[index.shape[-1]:])) + out = paddle.scatter(t_reshape, index_1d, value_1d) + if tensor_type is not None: + out = out.astype(tensor_type) + tensor_origin[:] = out.reshape(tensor_origin.shape) + + return tensor_origin + + +def replace_ellipsis(var, item): + from .framework import Variable + # Use slice(None) to replace Ellipsis. + # For var, var.shape = [3,4,5,6] + # + # var[..., 1:2] -> var[:, :, :, 1:2] + # var[0, ...] -> var[0] + # var[0, ..., 1:2] -> var[0, :, :, 1:2] + + item = list(item) + + # Remove Variable to skip bug when counting Ellipsis + item_remove_var = [ + ele for ele in item if not isinstance(ele, (Variable, np.ndarray)) + ] + ell_count = item_remove_var.count(Ellipsis) + if ell_count == 0: + return item + elif ell_count > 1: + raise IndexError("An index can only have a single ellipsis ('...')") + + ell_idx = item.index(Ellipsis) + + if ell_idx == len(item) - 1: + return item[:-1] + else: + item[ell_idx:ell_idx + 1] = [slice(None)] * ( + len(var.shape) - len(item) + 1) + + return item + + +def replace_none(item): + new_item = [] + none_axes = [] + for i, slice_item in enumerate(item): + if slice_item is None: + none_axes.append(i) + else: + new_item.append(slice_item) + return new_item, none_axes + + +def is_integer_or_scalar_tensor(ele): + from .framework import Variable + if isinstance(ele, int): + return True + elif isinstance(ele, Variable): + if len(ele.shape) == 1 and ele.shape[0] == 1: + return True + 
return False + + +def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags): + from .framework import Variable + from .layers import utils + + if utils._contain_var(attr): + inputs[tensor_attr_name] = utils._convert_to_tensor_list( + attr, dtype="int64") + for i, dim in enumerate(attr): + if isinstance(dim, Variable): + attrs[attr_name].append(-1) + infer_flags[i] = -1 + else: + attrs[attr_name].append(dim) + else: + attrs[attr_name] = attr + + +def _getitem_impl_(var, item): + """ + Slice the variable. + + Args: + item(int/slice/tuple) : the index. + + Returns: + Sliced variable + """ + from .framework import default_main_program, Variable + if isinstance(item, list): + if not is_one_dim_list(item, int): + item = tuple(item) + + if not isinstance(item, tuple): + item = (item, ) + + decrease_axes = [] + axes = [] + starts = [] + ends = [] + steps = [] + reverse_axes = [] + + use_strided_slice = False + item, none_axes = replace_none(item) + item = replace_ellipsis(var, item) + slice_info = SliceInfo() + + for dim, slice_item in enumerate(item): + if is_integer_or_scalar_tensor(slice_item): + if isinstance(slice_item, + int) and var.shape[dim] is not None and var.shape[ + dim] >= 0 and slice_item >= var.shape[dim]: + # For python, if users write a, b = var, the __getitem__ + # method will iterate through 0, 1, 2 ... until __getitem__ + # throws an IndexError, then stop. The var[0], var[1] will + # be given to a, b respectively. If more values are given, + # the unpack size would cause error. 
+ # + # We raise IndexError here to support grammar like `a, b = var` + raise IndexError( + "slice_item %d at dim %d should be >= 0 and < var.shape[%d]: %d" + % (slice_item, dim, dim, var.shape[dim])) + decrease_axes.append(dim) + start = slice_item + step = 1 + end = slice_item + 1 if slice_item != -1 else MAX_INTEGER + + elif isinstance(slice_item, slice): + start = slice_item.start + end = slice_item.stop + step = slice_item.step + + if start is None and end is None and step is None: + continue + + step = 1 if step is None else step + + if start is None: + start = 0 if step > 0 else MAX_INTEGER + if end is None: + end = MAX_INTEGER if step > 0 else -1 + + elif isinstance(slice_item, list): + all_bool = True + + if is_list_tuple(slice_item, int): + slice_info.update(slice_item) + continue + + for i in slice_item: + if type(i) is int: + all_bool = False + elif not isinstance(i, bool): + raise TypeError("Only support int or bool in index list.") + + if len(item) != 1: + raise IndexError( + "When index contains a list, its length must be 1, but received {}.". + format(len(item))) + new_slice_item = [] + if all_bool: + if len(slice_item) != var.shape[0]: + raise IndexError( + "The dimension of bool index doesn't match indexed array along "\ + "dimension 0, the target dimension is {}, but received {}.". 
+ format(var.shape[0], len(slice_item))) + for idx, ele in enumerate(slice_item): + if ele is True: + new_slice_item.append(idx) + slice_item = new_slice_item + else: + for idx, ele in enumerate(slice_item): + if type(ele) is int: + new_slice_item.append(ele) + elif ele is True: + new_slice_item.append(1) + else: + new_slice_item.append(0) + slice_item = new_slice_item + + from .layers import assign + from ..tensor import index_select + + idx = assign(np.array(slice_item).astype("int32")) + return index_select(var, index=idx, axis=0) + + elif isinstance(slice_item, np.ndarray): + slice_info.update(slice_item) + continue + elif isinstance(slice_item, (Variable)): + if len(item) == 1: + + from ..tensor import index_select, gather_nd + from .layers.nn import where + + if slice_item.dtype == paddle.bool: + if len(slice_item.shape) > len(var.shape): + raise IndexError( + "The dims of bool index doesn't match indexed array, " + "the dims of bool index except to be equal or less " + "than {}, but received {}.".format( + len(var.shape), len(slice_item.shape))) + for i, dim_len in enumerate(slice_item.shape): + if dim_len != var.shape[i]: + raise IndexError( + "The dimension of bool index doesn't match indexed array along "\ + "dimension {}, the target dimension is {}, but received {}.". + format(i, var.shape[i], dim_len)) + bool_2_idx = where(slice_item == True) + return gather_nd(var, bool_2_idx) + else: + if len(slice_item.shape) == 1: + return index_select(var, index=slice_item, axis=0) + else: + slice_info.update(slice_item) + continue + else: + slice_info.update(slice_item) + continue + + else: + raise IndexError( + "Valid index accept int or slice or ellipsis or list, but received {}.". 
+ format(slice_item)) + + axes.append(dim) + starts.append(start) + ends.append(end) + steps.append(step) + use_strided_slice = True if step != 1 else use_strided_slice + + if slice_info.indexes: + if len(slice_info.indexes) != len(item): + raise IndexError( + "Valid index accept int or slice or ellipsis or list, but received {}.". + format(item)) + return slice_info.get_item(var) + + inputs = {'Input': [var]} + attrs = { + 'axes': axes, + 'starts': [], + 'ends': [], + 'decrease_axis': decrease_axes + } + if use_strided_slice: + attrs['strides'] = [] + + infer_flags = [1] * len(axes) + deal_attrs(attrs, starts, "starts", "StartsTensorList", inputs, infer_flags) + deal_attrs(attrs, ends, "ends", "EndsTensorList", inputs, infer_flags) + deal_attrs(attrs, steps, "strides", "StridesTensorList", inputs, + infer_flags) + attrs['infer_flags'] = infer_flags + + out = var + if len(axes) > 0: + target_block = default_main_program().current_block() + op_type = "strided_slice" if use_strided_slice else "slice" + + slice_out_var = target_block.create_var( + name=unique_name.generate_with_ignorable_key(var.name + "_" + + op_type), + dtype=var.dtype) + target_block.append_op( + type=op_type, + inputs=inputs, + outputs={'Out': [slice_out_var]}, + attrs=attrs) + out = slice_out_var + + if len(reverse_axes) > 0: + from .layers.tensor import reverse + out = reverse(out, axis=reverse_axes) + + # Deal with cases when all axes are decreased. + # After slice, the shape of out is [1], which should have been [], but Paddle doesn't support scalar. + # In order to ensure the correctness of the final shape of out, one dimension of out needs to be decreased. 
+ # For example: + # # x.shape: (2,3,4) + # out = x[0, 1, 1, None] # out.shape : (1) + if len(decrease_axes) == len(var.shape): + none_axes = none_axes[1:] + + if len(none_axes) > 0: + # Deal with cases that decrease_axes is not empty + # For example: + # # x.shape: (2,3,4) + # out = x[0, 0:2, None] # out.shape : (2, 1, 4) + for idx, axis in enumerate(none_axes): + l = len([i for i in decrease_axes if i < axis]) + new_axis = axis - l + none_axes[idx] = new_axis + + # Deal with cases when all axes are decreased. + # After slice, the shape of out is [1], which should have been [], but Paddle doesn't support scalar. + # In order to ensure the correctness of the final shape of out, one dimension of out needs to be decreased. + # For example: + # # x.shape: (2,3,4) + # out = x[0, 1, 1, None] # out.shape : (1) + + from ..tensor import unsqueeze + out = unsqueeze(out, axis=none_axes) + + return out + + +def _setitem_impl_(var, item, value): + from .framework import default_main_program, Variable + + inputs = {'Input': var} + if isinstance(item, list): + if not is_one_dim_list(item, int): + item = tuple(item) + # 1. 
Parse item + if not isinstance(item, tuple): + item = (item, ) + + decrease_axes = [] + axes = [] + starts = [] + ends = [] + steps = [] + + item, none_axes = replace_none(item) + item = replace_ellipsis(var, item) + slice_info = SliceInfo() + dim = 0 + for _, slice_item in enumerate(item): + if is_integer_or_scalar_tensor(slice_item): + decrease_axes.append(dim) + start = slice_item + end = slice_item + 1 if slice_item != -1 else MAX_INTEGER + step = 1 + + elif isinstance(slice_item, slice): + start = slice_item.start + end = slice_item.stop + step = slice_item.step + + if start is None and end is None and step is None: + dim += 1 + continue + + step = 1 if step is None else step + + if not isinstance(step, Variable) and step == 0: + raise ValueError( + "When assign a value to a paddle.Tensor, step can not be 0, " + "but received step is {}.".format(step)) + + if isinstance(step, Variable) and (start is None or end is None): + raise ValueError( + "When assign a value to a paddle.Tensor, it's not supported that " + "the start or end is None when the type of step is paddle.Tensor." + ) + + if start is None: + start = 0 if step > 0 else MAX_INTEGER + + if end is None: + end = MAX_INTEGER if step > 0 else (0 - MAX_INTEGER) + elif isinstance(slice_item, list): + if is_list_tuple(slice_item, int): + slice_info.update(slice_item) + continue + + for i in slice_item: + if not isinstance(i, bool): + raise TypeError("Doesn't support {} in index list.".format( + type(i))) + + if len(item) != 1: + raise IndexError( + "When index contains a bool list, its length must be 1, but received {}.". 
+ format(len(item))) + + from .layers import assign + idx_tensor = assign(slice_item) + return set_value_for_bool_tensor(var, idx_tensor, value) + + elif isinstance(slice_item, np.ndarray): + slice_info.update(slice_item) + continue + + elif isinstance(slice_item, Variable): + if slice_item.dtype == core.VarDesc.VarType.BOOL: + if len(item) != 1: + raise IndexError( + "When index contains a bool tensor, its length must be 1, but received {}.". + format(len(item))) + return set_value_for_bool_tensor(var, slice_item, value) + else: + slice_info.update(slice_item) + continue + else: + raise IndexError( + "Valid index accept int, slice, ellipsis, None, list of bool, Variable, " + "but received {}.".format(slice_item)) + + axes.append(dim) + starts.append(start) + ends.append(end) + steps.append(step) + + dim += 1 + if slice_info.indexes: + if len(slice_info.indexes) != len(item): + raise IndexError( + "Valid index accept int or slice or ellipsis or list, but received {}.". + format(item)) + return slice_info.set_item(var, value) + attrs = { + 'axes': axes, + 'starts': starts, + 'ends': ends, + 'steps': steps, + 'decrease_axes': decrease_axes, + 'none_axes': none_axes + } + + from .layers import utils + if utils._contain_var(starts): + inputs['StartsTensorList'] = utils._convert_to_tensor_list(starts) + del attrs['starts'] + if utils._contain_var(ends): + inputs['EndsTensorList'] = utils._convert_to_tensor_list(ends) + del attrs['ends'] + if utils._contain_var(steps): + inputs['StepsTensorList'] = utils._convert_to_tensor_list(steps) + del attrs['steps'] + + # 2. 
Parse value + dtype = var.dtype + attrs['dtype'] = dtype + + from .data_feeder import convert_dtype + # 2.1 value is an integer of float + if isinstance(value, (int, float)): + value = np.array([value]).astype(convert_dtype(dtype)) + + # 2.2 value is a np.ndarray + if isinstance(value, np.ndarray): + shape = list(value.shape) + if dtype == core.VarDesc.VarType.BOOL: + value_name = "bool_values" + values = [bool(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.FP64: + value_name = "fp64_values" + values = [float(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.INT64: + value_name = "int64_values" + values = [int(v) for v in value.flat] + else: + raise TypeError( + "When assign a numpy.ndarray, integer or float to a paddle.Tensor, " + "the data type of the paddle.Tensor must be bool, float32, int32 or int64, but " + "received %s." % convert_dtype(dtype)) + attrs[value_name] = values + attrs["shape"] = shape + + elif isinstance(value, Variable): + inputs["ValueTensor"] = value + else: + raise TypeError( + "Only support to assign an integer, float, numpy.ndarray or " + "paddle.Tensor to a paddle.Tensor, but received {}".format( + type(value))) + + cur_block = default_main_program().current_block() + cur_block.append_op( + type="set_value", inputs=inputs, outputs={'Out': var}, attrs=attrs) + + return var + + +# the item is a tensor of bool +def set_value_for_bool_tensor(var, item, value): + + # TODO(zyfncg): Now scatter_nd_add only support float32 and float64 tensor, + # so in the current version we also only support float32 and float64 tensor, + # this problem will be fixed in the future. 
+ if var.dtype != core.VarDesc.VarType.FP32 and var.dtype != core.VarDesc.VarType.FP64: + raise TypeError("Only support float and double tensor for bool index, " + "but received {}.".format(var.dtype)) + + if len(item.shape) > len(var.shape): + raise IndexError("The dims of bool index doesn't match indexed array, " + "the dims of bool index except to be equal or less " + "than {}, but received {}.".format( + len(var.shape), len(item.shape))) + for i, dim_len in enumerate(item.shape): + if dim_len != var.shape[i]: + raise IndexError( + "The dimension of bool index doesn't match indexed array along " + "dimension {}, the target dimension is {}, but received {}.". + format(i, var.shape[i], dim_len)) + + def idx_not_empty(var, item, value): + from .framework import Variable + from .layers import assign + from .layers.nn import where + from ..tensor import gather_nd, scatter_nd_add + + if not isinstance(value, Variable): + value = assign(value).cast(var.dtype) + + idx = where(item) + gather_val = gather_nd(var, idx) + gather_val_new = value - gather_val + out = scatter_nd_add(var, idx, gather_val_new) + var[:] = out + + from .layers.control_flow import cond + # If all the bool index is False, just do nothing + cond(item.any(), lambda: idx_not_empty(var, item, value)) + + return var