diff --git a/torch_rec_op/AsynchronousCompleteCumsum/README.md b/torch_rec_op/AsynchronousCompleteCumsum/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e16c330ab6ba92f0a9ad142db7514be7422729df --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/README.md @@ -0,0 +1,114 @@ +## 目录结构 +| 目录 | 描述 | +|---------------------|----------------------| +| matmul_leakyrelu_custom.json | MatmulLeakyReluCustom算子的原型定义json文件 | +| [MatmulLeakyReluCustom](./MatmulLeakyReluCustom) | MatmulLeakyReluCustom算子工程 | +| [AclNNInvocation](./AclNNInvocation) | 通过aclnn调用的方式调用MatmulLeakyReluCustom算子工程。 | + +## 编译算子工程部署算子包 + +### 1.获取源码包 + + 可以使用以下两种方式下载,请选择其中一种进行源码准备。 + + - 命令行方式下载(下载时间较长,但步骤简单)。 + + ``` + # 开发环境,非root用户命令行中执行以下命令下载源码仓。 + cd ${HOME} + git clone https://gitee.com/ascend/samples.git + ``` + **注:如果需要切换到其它tag版本,以v0.5.0为例,可执行以下命令。** + ``` + git checkout v0.5.0 + ``` + - 压缩包方式下载(下载时间较短,但步骤稍微复杂)。 + **注:如果需要下载其它版本代码,请先请根据前置条件说明进行samples仓分支切换。** + ``` + # 1. samples仓右上角选择 【克隆/下载】 下拉框并选择 【下载ZIP】。 + # 2. 将ZIP包上传到开发环境中的普通用户家目录中,【例如:${HOME}/ascend-samples-master.zip】。 + # 3. 
开发环境中,执行以下命令,解压zip包。 + cd ${HOME} + unzip ascend-samples-master.zip + ``` + +### 2.编译算子工程 + + 编译自定义算子工程,构建生成自定义算子包 + + - 执行如下命令,切换到算子工程MatmulLeakyReluCustom目录 + + ``` + cd $HOME/samples/operator/MatMulLeakyReluCustomSample/FrameworkLaunch/MatmulLeakyReluCustom + ``` + + - 修改CMakePresets.json中ASCEND_CANN_PACKAGE_PATH为CANN软件包安装后的实际路径。 + + + ``` + { + …… + "configurePresets": [ + { + …… + "ASCEND_CANN_PACKAGE_PATH": { + "type": "PATH", + "value": "~/Ascend/ascend-toolkit/latest" //请替换为CANN软件包安装后的实际路径。eg:/home/HwHiAiUser/Ascend/ascend-toolkit/latest + }, + …… + } + ] + } + ``` + - 在算子工程MatmulLeakyReluCustom目录下执行如下命令,进行算子工程编译。 + + ``` + ./build.sh + ``` + 编译成功后,会在当前目录下创建build_out目录,并在build_out目录下生成自定义算子安装包custom_opp__.run,例如“custom_opp_ubuntu_x86_64.run”。 + + +### 3.部署算子包 + + - 执行如下命令,在自定义算子安装包所在路径下,安装自定义算子包。 + + ``` + cd build_out + ./custom_opp__.run + ``` + + 命令执行成功后,自定义算子包中的相关文件将部署至当前环境的OPP算子库的vendors/customize目录中。 + +## 配置环境变量 + + 这里的\$HOME需要替换为CANN包的安装路径。 + ``` + export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest + ``` + +## 通过aclnn调用的方式调用MatmulLeakyReluCustom算子工程 + +### 样例运行 + + - 进入到样例目录 + + ``` + cd $HOME/samples/operator/MatmulLeakyReluCustomSample/FrameworkLaunch/AclNNInvocation + ``` + + - 样例执行 + + 样例执行过程中会自动生成测试数据,然后编译与运行aclnn样例,最后检验运行结果。具体过程可参见run.sh脚本。 + ``` + bash run.sh + ``` + +## 更新说明 + | 时间 | 更新事项 | +|----|------| +| 2023/11/9 | 新增AclNNInvocation样例 | + + +## 已知issue + + 暂无 \ No newline at end of file diff --git a/torch_rec_op/AsynchronousCompleteCumsum/asynchronous_complete_cumsum.json b/torch_rec_op/AsynchronousCompleteCumsum/asynchronous_complete_cumsum.json new file mode 100644 index 0000000000000000000000000000000000000000..3d282b0234e76f83c14215796b0003b58a6df61f --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/asynchronous_complete_cumsum.json @@ -0,0 +1,30 @@ +[ + { + "op": "AsynchronousCompleteCumsum", + "language": "cpp", + "input_desc": [ + { + "name": "x", + "param_type": "required", + "format": [ + "ND", "ND" + ], + 
"type": [ + "int64", "int32" + ] + } + ], + "output_desc": [ + { + "name": "y", + "param_type": "required", + "format": [ + "ND", "ND" + ], + "type": [ + "int64", "int32" + ] + } + ] + } +] \ No newline at end of file diff --git a/torch_rec_op/AsynchronousCompleteCumsum/clear_all.sh b/torch_rec_op/AsynchronousCompleteCumsum/clear_all.sh new file mode 100644 index 0000000000000000000000000000000000000000..c8c732744cef69affbfeb52ed562088bd8d35083 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/clear_all.sh @@ -0,0 +1,4 @@ +rm -rf asynchronous_complete_cumsum +rm -rf aclnn_asynchronous_complete_cumsum/build +rm -rf aclnn_asynchronous_complete_cumsum/input +rm -rf aclnn_asynchronous_complete_cumsum/output \ No newline at end of file diff --git a/torch_rec_op/AsynchronousCompleteCumsum/creat.sh b/torch_rec_op/AsynchronousCompleteCumsum/creat.sh new file mode 100644 index 0000000000000000000000000000000000000000..0763f5c665f8399dc652805448630be8083b4f36 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/creat.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +set -e + + +# 查找msopgen的路径,加入到环境变量PATH中 +msopgen_path=$(find /usr/local/Ascend/ -name msopgen | grep bin) +parent_dir=$(dirname "$msopgen_path") +export PATH=$parent_dir:$PATH + +# 利用msopgen生成可编译文件 +rm -rf ./asynchronous_complete_cumsum +python3 /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/bin/msopgen gen -i asynchronous_complete_cumsum.json -f tf -c ai_core-Ascend910 -lan cpp -out ./asynchronous_complete_cumsum -m 0 -op AsynchronousCompleteCumsum +rm -rf asynchronous_complete_cumsum/op_kernel +rm -rf asynchronous_complete_cumsum/host +cp -rf op_kernel asynchronous_complete_cumsum/ +cp -rf op_host asynchronous_complete_cumsum/ +cd asynchronous_complete_cumsum + +# 判断当前目录下是否存在CMakePresets.json文件 +if [ ! -f "CMakePresets.json" ]; then + echo "ERROR, CMakePresets.json file not exist." + exit 1 +fi + +# 禁止生成CRC校验和 +sed -i 's/--nomd5/--nomd5 --nocrc/g' ./cmake/makeself.cmake + +# 修改cann安装路径 +sed -i 's:"/usr/local/Ascend/latest":"/usr/local/Ascend/ascend-toolkit/latest":g' CMakePresets.json +# 修改vendor_name 防止覆盖之前vendor_name为customize的算子; +# vendor_name需要和aclnn中的CMakeLists.txt中的CUST_PKG_PATH值同步,不同步aclnn会调用失败; +# vendor_name字段值不能包含customize;包含会导致多算子部署场景CANN的vendors路径下config.ini文件内容截取错误 +sed -i 's:"customize":"asynchronous_complete_cumsum":g' CMakePresets.json + +bash build.sh + +# # 安装编译成功的算子包 +bash ./build_out/custom_opp*.run diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_host/CMakeLists.txt b/torch_rec_op/AsynchronousCompleteCumsum/op_host/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2975907ef6c898778223103de7880a64760494d3 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_host/CMakeLists.txt @@ -0,0 +1,96 @@ +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) + +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +add_library(cust_op_proto SHARED ${ops_srcs} ${ASCEND_AUTOGEN_PATH}/op_proto.cc) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +add_library(cust_optiling SHARED ${ops_srcs}) +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB aclnn_src 
${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) +add_library(cust_opapi SHARED ${aclnn_src}) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) + +install(TARGETS cust_op_proto + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) +install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) +install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) +install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum.cpp b/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d9209f3711e133a571bed6002585cf0a7cdd6bbf --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum.cpp @@ -0,0 +1,97 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#include "asynchronous_complete_cumsum_tiling.h" +#include "register/op_def_registry.h" + +namespace { + constexpr int32_t EMBEDDING_TYPE_INT64 = 0; + constexpr int32_t EMBEDDING_TYPE_INT32 = 1; +} + +namespace optiling { + static ge::graphStatus TilingFunc(gert::TilingContext* context) + { + AsynchronousCompleteCumsumTilingData tiling; + auto inputTensor0 = context->GetInputTensor(0); + uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); + uint32_t dimNum = context->GetInputShape(0)->GetOriginShape().GetDimNum(); + + ge::DataType inputDatatype = inputTensor0->GetDataType(); + uint32_t embeddingType; + if (inputDatatype == ge::DT_INT64) { + embeddingType = EMBEDDING_TYPE_INT64; + } else if (inputDatatype == ge::DT_INT32) { + embeddingType = EMBEDDING_TYPE_INT32; + } + + tiling.set_totalLength(totalLength); + tiling.set_dimNum(dimNum); + tiling.set_inputType(embeddingType); + + if (dimNum != 1) { + printf("AsynchronousCompleteCumsum required the dim of input-0 is 1 but %ld ", dimNum); + return ge::FAILED; + } + context->SetBlockDim(1); //? 
这里的block设置 + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + + return ge::GRAPH_SUCCESS; + } +} + + +namespace ge { + static ge::graphStatus InferShape(gert::InferShapeContext* context) + { + const gert::Shape* xShape = context->GetInputShape(0); + gert::Shape* yShape = context->GetOutputShape(0); + + int64_t inputLength = xShape->GetShapeSize(); + yShape->SetDim(0, inputLength+1); + return GRAPH_SUCCESS; + } +} + + +namespace ops { + class AsynchronousCompleteCumsum : public OpDef { + public: + explicit AsynchronousCompleteCumsum(const char* name) : OpDef(name) + { + this->Input("x") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("y") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); + + this->SetInferShape(ge::InferShape); + + this->AICore() + .SetTiling(optiling::TilingFunc); + + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910"); + } + }; + + OP_ADD(AsynchronousCompleteCumsum); +} diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum_tiling.h b/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..cb3c27e5bbd08d44d108a2ddba90802458a5a99b --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum_tiling.h @@ -0,0 +1,30 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#ifndef ASYNCHRONOUS_COMPLETE_CUMSUM_H +#define ASYNCHRONOUS_COMPLETE_CUMSUM_H +#include "register/tilingdata_base.h" + +namespace optiling { + BEGIN_TILING_DATA_DEF(AsynchronousCompleteCumsumTilingData) + TILING_DATA_FIELD_DEF(uint32_t, totalLength); // 处理数据的总长度 + TILING_DATA_FIELD_DEF(uint32_t, dimNum); // 数据的维度 + TILING_DATA_FIELD_DEF(uint32_t, inputType); // 数据的类型 + END_TILING_DATA_DEF; + + REGISTER_TILING_DATA_CLASS(AsynchronousCompleteCumsum, AsynchronousCompleteCumsumTilingData) +} + +#endif // ASYNCHRONOUS_COMPLETE_CUMSUM_H diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/CMakeLists.txt b/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c94a952da2058b785affa6784d78b87dfe9b3d7 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/CMakeLists.txt @@ -0,0 +1,68 @@ +# set custom compile options +if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL OPTIONS -g -O0) +endif() + +foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # generate ascendc impl py once + if (NOT TARGET 
ascendc_impl_gen) + add_ops_impl_target(TARGET ascendc_impl_gen + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl + ) + endif() + + # dynamic shape binary compile + if (${ENABLE_BINARY_PACKAGE} AND NOT ${ENABLE_CROSS_COMPILE}) + add_bin_compile_target(TARGET ascendc_bin_${compute_unit} + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + ADP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/dynamic + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + COMPUTE_UNIT ${compute_unit} + ) + add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen) + endif() + + if (${ENABLE_CROSS_COMPILE} AND ${ENABLE_BINARY_PACKAGE}) + add_cross_compile_target( + TARGET bin_${compute_unit} + OUT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../kernel + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/ + ) + endif() +endforeach() + +# generate npu_supported_ops.json +add_npu_support_target(TARGET npu_supported_ops + OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH} + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core + INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} +) + +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# install kernel file +if (${ENABLE_SOURCE_PACKAGE}) + file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.py + ) + install(FILES ${KERNEL_FILES} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) +endif() diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/asynchronous_complete_cumsum.cpp 
b/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/asynchronous_complete_cumsum.cpp new file mode 100644 index 0000000000000000000000000000000000000000..60bc4e7455eeb351b49e1c5bdd90d43077dd5d71 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/asynchronous_complete_cumsum.cpp @@ -0,0 +1,35 @@ +#include "kernel_operator.h" + +using namespace AscendC; + +extern "C" __global__ __aicore__ void asynchronous_complete_cumsum(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, + GM_ADDR tiling) { + GET_TILING_DATA(tiling_data, tiling); + int64_t totalLen = tiling_data.totalLength; + uint32_t inputType = tiling_data.inputType; + + switch (inputType) + { + case 0 : + { + __gm__ int64_t* xPtr = (__gm__ int64_t*) x; + __gm__ int64_t* yPtr = (__gm__ int64_t*) y; + *(yPtr) = 0; + for (int i=0; i ", real_result_file, real_result[:129]) + print("golden result, example-> ", golden_file, golden[:129]) + + result = np.abs(real_result - golden) + deno = np.maximum(np.abs(real_result), np.abs(golden)) + result_atol = np.less_equal(result, loss) + result_rtol = np.less_equal(result / np.add(deno, minimum), loss) + if not result_rtol.all() and not result_atol.all(): + if np.sum(result_rtol == False) > real_result.size * loss and np.sum(result_atol == False) > real_result.size * loss: + print("[ERROR] result error") + return False + print("test pass") + return True + +if __name__ == '__main__': + print("=============================x============") + verify_result("./output/y.bin", "./output/golden.bin") \ No newline at end of file diff --git a/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/CMakeLists.txt b/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fb7e9e0bc9a0bb4893841ea4954301f69bad919 --- /dev/null +++ b/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/CMakeLists.txt @@ -0,0 +1,84 @@ +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_asynchronous_complete_cumsum) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/asynchronous_complete_cumsum/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/") + set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ../../inc + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib +) + +add_definitions(-DOP_NAME=AsynchronousCompleteCumsum) 
+add_definitions(-DINPUT_NUM=1)
+add_definitions(-DOUTPUT_NUM=1)
+
+# Sources: list each file exactly once. The original list named
+# ../../src/op_runner.cpp twice and contained a stray bare directory entry
+# "../../src/" before common.cpp, which breaks the build.
+add_executable(execute_asynchronous_complete_cumsum_op
+    ../../src/operator_desc.cpp
+    ../../src/op_runner.cpp
+    main.cpp
+    ../../src/common.cpp
+)
+
+target_link_libraries(execute_asynchronous_complete_cumsum_op
+    ascendcl
+    cust_opapi
+    acl_op_compiler
+    nnopbase
+    stdc++
+)
+
+install(TARGETS execute_asynchronous_complete_cumsum_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
diff --git a/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/main.cpp b/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9dcfcbe8f4d15cead0152450d394a38f7ea12702
--- /dev/null
+++ b/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/main.cpp
@@ -0,0 +1,171 @@
+/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+ limitations under the License.
+==============================================================================*/ +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "op_runner.h" + +#include "../../inc/common.h" + +bool g_isDevice = false; +int deviceId = 0; + +OperatorDesc CreateOpDesc() +{ + // define operator + std::vector x_shape { 128 }; + std::vector y_shape { 129 }; + aclFormat format = ACL_FORMAT_ND; + + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(ACL_INT64, x_shape.size(), x_shape.data(), format); + opDesc.AddOutputTensorDesc(ACL_INT64, y_shape.size(), y_shape.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/input_x.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + INFO_LOG("Set input success"); + return true; +} + +bool ProcessOutputData(OpRunner &runner) +{ + WriteFile("../output/y.bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); + INFO_LOG("Write output success"); + return true; +} + +void DestoryResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destory resource failed"); + } else { + INFO_LOG("Destory resource success"); + } +} + +bool InitResource() +{ + // 创建输出目录output + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } + else { + ERROR_LOG("Make output directory fail"); + return false; + } + } + + // acl.json is dump or profiling config file + // aclInit接口 -> 初始化AscendCL (一个进程里面只能调用一次)。init里面的传参是dump\profiling config file, 可以不传 + if (aclInit(NULL) != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + // 运行管理资源申请 + if 
(aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + // 获取当前的RunMode + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestoryResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp() +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + if (!RunOp()) { + DestoryResource(); + return FAILED; + } + + DestoryResource(); + + return SUCCESS; +} diff --git a/torch_rec_op/OpTest/inc/aclnn.h b/torch_rec_op/OpTest/inc/aclnn.h new file mode 100644 index 0000000000000000000000000000000000000000..93562eb0561afa3bff814c57101e60f03a3fa83d --- /dev/null +++ b/torch_rec_op/OpTest/inc/aclnn.h @@ -0,0 +1,25 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#ifndef ACLNN_H +#define ACLNN_H + + +#if OP_NAME == AsynchronousCompleteCumsum +#include "aclnn_asynchronous_complete_cumsum.h" +#endif + + +#endif // ACLNN_H diff --git a/torch_rec_op/OpTest/inc/common.h b/torch_rec_op/OpTest/inc/common.h new file mode 100644 index 0000000000000000000000000000000000000000..02da30e3cb6d544849c6f39ee1184d2a79820914 --- /dev/null +++ b/torch_rec_op/OpTest/inc/common.h @@ -0,0 +1,50 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +constexpr int SUCCESS = 0; +constexpr int FAILED = 1; + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size); + +#endif // COMMON_H diff --git a/torch_rec_op/OpTest/inc/op_runner.h b/torch_rec_op/OpTest/inc/op_runner.h new file mode 100644 index 0000000000000000000000000000000000000000..e7c2faca11bc9eb63b729011c401724d4b41a1ea --- /dev/null +++ b/torch_rec_op/OpTest/inc/op_runner.h @@ -0,0 +1,187 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "aclnn/acl_meta.h" +#include "acl/acl.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + 
std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + +private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/torch_rec_op/OpTest/inc/operator_desc.h b/torch_rec_op/OpTest/inc/operator_desc.h new file mode 100644 
index 0000000000000000000000000000000000000000..caea5423900fbdb2808e8fb7e24b962c854cf8d7 --- /dev/null +++ b/torch_rec_op/OpTest/inc/operator_desc.h @@ -0,0 +1,62 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; +}; + +#endif // OPERATOR_DESC_H diff --git a/torch_rec_op/OpTest/run_test.sh b/torch_rec_op/OpTest/run_test.sh new file mode 100644 index 
0000000000000000000000000000000000000000..16a2577677d8f20ad77ddc193024e142d45e80a4 --- /dev/null +++ b/torch_rec_op/OpTest/run_test.sh @@ -0,0 +1,133 @@ + +#!/bin/bash + +# 设置环境变量 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=0 + +if [ ! $ASCEND_HOME_DIR ]; then + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest + else + export ASCEND_HOME_DIR=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export DDK_PATH=$ASCEND_HOME_DIR +arch=$(uname -m) +export NPU_HOST_LIB=$ASCEND_HOME_DIR/${arch}-linux/lib64 + +WORKER_DIR=$(pwd) +PARENT_PATH=$(dirname $(pwd)) +cd "$PARENT_PATH" + +# 编译算子 +build_op() +{ + CURRENT_DIR=$(pwd) + bash ./creat.sh +} + +op_test() { + # 1. 清除遗留生成文件和日志文件 + rm -rf $HOME/ascend/log/* + rm ./input/*.bin + rm ./output/*.bin + + # 2. 生成输入数据和真值数据 + python3 scripts/gen_data.py + if [ $? -ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + + # 3. 编译acl可执行文件 + rm -rf build; mkdir -p build; cd build + cmake ../src + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + # 4. 运行可执行文件 + cd ../output + echo "INFO: execute op!" + ./$exec_op + + if [ $? -ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + + # 5. 比较真值文件 + cd ../ + ret=`python3 scripts/verify_result.py` + echo $ret + if [ "x$ret" == "xtest pass" ]; then + echo "" + echo "#####################################" + echo "INFO: you have passed the Precision!" 
+ echo "#####################################" + echo "" + fi +} + +to_snake_case(){ + local input_str=$1 + echo "$input_str" | sed -E 's/([A-Z])/_\1/g' | sed -E 's/^_//g' | tr 'A-Z' 'a-z' +} + +prefix='aclnn_' +exec_prefix='execute_' +exec_suffix='_op' +test_all_op() { + for dir in "$PARENT_PATH"/*; do + cd "$PARENT_PATH" + if [ -d "$dir" ]; then + dir_name=$(basename "$dir") + if [ "$dir_name" != "OpTest" ]; then + echo "Entering directory: $dir_name" + cd "$dir_name" + test_dir_name="$prefix$(to_snake_case "$dir_name")" + exec_op="$exec_prefix$(to_snake_case "$dir_name")$exec_suffix" + echo "test_dir_name: $test_dir_name" + build_op + cd "$WORKER_DIR" + cd "$test_dir_name" + op_test + fi + fi + done +} + +TEST_OP_COUNT=$(($(find "$PARENT_PATH" -maxdepth 1 -type d | wc -l) - 2)) +echo -e "\033[32m==============Start to test op, total op count is: $TEST_OP_COUNT=====================\033[0m" + +if [ $# -ge 1 ]; then + work_dir_name=$1 + cd "$work_dir_name" + test_dir_name="$prefix$(to_snake_case "$work_dir_name")" + exec_op="$exec_prefix$(to_snake_case "$work_dir_name")$exec_suffix" + build_op + cd "$WORKER_DIR" + cd "$test_dir_name" + op_test +else + test_all_op +fi + + + + + + diff --git a/torch_rec_op/OpTest/src/common.cpp b/torch_rec_op/OpTest/src/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d2746d80e7f20d70fbb42dcbc2a0ab68942fc1c6 --- /dev/null +++ b/torch_rec_op/OpTest/src/common.cpp @@ -0,0 +1,84 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ +#include "common.h" + +#include +#include +#include +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/torch_rec_op/OpTest/src/op_runner.cpp b/torch_rec_op/OpTest/src/op_runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0e77c8b05886710141470419efdd62ef357282a9 --- /dev/null +++ b/torch_rec_op/OpTest/src/op_runner.cpp @@ -0,0 +1,498 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ +#include "op_runner.h" + +#include +#include + +#include +#include "acl/acl_op_compiler.h" +#include "common.h" +#include "aclnn.h" + +using namespace std; + +extern bool g_isDevice; + +// 定义宏拼接, 拼接函数名的前缀、后缀 + +#define _CONCAT_OP_FUNC_FOR_SPACE_SIZE(prefix, name, suffix) prefix##name##suffix +#define CONCAT_OP_FUNC_FOR_SPACE_SIZE(prefix, name, suffix) _CONCAT_OP_FUNC_FOR_SPACE_SIZE(prefix, name, suffix) + +#define _CONCAT_OP_FUNC(prefix, name) prefix##name +#define CONCAT_OP_FUNC(prefix, name) _CONCAT_OP_FUNC(prefix, name) + +#define INPUT_1 inputTensor_[0] +#define INPUT_2 inputTensor_[0], inputTensor_[1] +#define INPUT_3 inputTensor_[0], inputTensor_[1], inputTensor_[2] +#define INPUT_4 inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3] + +#define OUTPUT_1 outputTensor_[0] +#define OUTPUT_2 outputTensor_[0], outputTensor_[1] +#define OUTPUT_3 outputTensor_[0], outputTensor_[1], outputTensor_[2] +#define OUTPUT_4 outputTensor_[0], outputTensor_[1], outputTensor_[2], outputTensor_[3] + +#define _INPUT(i) INPUT_##i +#define _OUTPUT(i) OUTPUT_##i + +#define INPUT(i) _INPUT(i) +#define OUTPUT(i) _OUTPUT(i) + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); +} + +OpRunner::~OpRunner() +{ + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() 
+{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), + nullptr, 0, GetInputFormat(i), GetInputShape(i).data(), + GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == 
nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), + nullptr, 0, GetOutputFormat(i), GetOutputShape(i).data(), + GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + return numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. 
dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + + auto ret = CONCAT_OP_FUNC_FOR_SPACE_SIZE(aclnn, OP_NAME, GetWorkspaceSize)(INPUT(INPUT_NUM), OUTPUT(OUTPUT_NUM), + &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. 
error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAsynchronousCompleteCumsumGetWorkspaceSize success, workspace size %lu", workspaceSize); + + void *workspace = nullptr; + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = CONCAT_OP_FUNC(aclnn, OP_NAME)(workspace, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAsynchronousCompleteCumsum success"); + + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + auto beforeTime = std::chrono::steady_clock::now(); + for (int i = 0; i<100; i++) { + ret = CONCAT_OP_FUNC_FOR_SPACE_SIZE(aclnn, OP_NAME, GetWorkspaceSize)(INPUT(INPUT_NUM), OUTPUT(OUTPUT_NUM), + &workspaceSize, &handle); + ret = CONCAT_OP_FUNC(aclnn, OP_NAME)(workspace, workspaceSize, handle, stream); + } + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + auto afterTime = std::chrono::steady_clock::now(); + double duration_microsecond = std::chrono::duration(afterTime - beforeTime).count(); + std::cout << "time cost " << duration_microsecond/100 << " us" << std::endl; + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return 
true; +} + + +template +void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void 
OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} + +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} diff --git a/torch_rec_op/OpTest/src/operator_desc.cpp b/torch_rec_op/OpTest/src/operator_desc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6726ae3ebcbdd2550c0dfdde3204336d583741a0 --- /dev/null +++ b/torch_rec_op/OpTest/src/operator_desc.cpp @@ -0,0 +1,60 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ +#include "common.h" +#include "operator_desc.h" + +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + + outputDesc.emplace_back(desc); + return *this; +}