diff --git a/torch_rec_op/AsynchronousCompleteCumsum/README.md b/torch_rec_op/AsynchronousCompleteCumsum/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e16c330ab6ba92f0a9ad142db7514be7422729df --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/README.md @@ -0,0 +1,114 @@ +## 目录结构 +| 目录 | 描述 | +|---------------------|----------------------| +| matmul_leakyrelu_custom.json | MatmulLeakyReluCustom算子的原型定义json文件 | +| [MatmulLeakyReluCustom](./MatmulLeakyReluCustom) | MatmulLeakyReluCustom算子工程 | +| [AclNNInvocation](./AclNNInvocation) | 通过aclnn调用的方式调用MatmulLeakyReluCustom算子工程。 | + +## 编译算子工程部署算子包 + +### 1.获取源码包 + + 可以使用以下两种方式下载,请选择其中一种进行源码准备。 + + - 命令行方式下载(下载时间较长,但步骤简单)。 + + ``` + # 开发环境,非root用户命令行中执行以下命令下载源码仓。 + cd ${HOME} + git clone https://gitee.com/ascend/samples.git + ``` + **注:如果需要切换到其它tag版本,以v0.5.0为例,可执行以下命令。** + ``` + git checkout v0.5.0 + ``` + - 压缩包方式下载(下载时间较短,但步骤稍微复杂)。 + **注:如果需要下载其它版本代码,请先请根据前置条件说明进行samples仓分支切换。** + ``` + # 1. samples仓右上角选择 【克隆/下载】 下拉框并选择 【下载ZIP】。 + # 2. 将ZIP包上传到开发环境中的普通用户家目录中,【例如:${HOME}/ascend-samples-master.zip】。 + # 3. 
开发环境中,执行以下命令,解压zip包。 + cd ${HOME} + unzip ascend-samples-master.zip + ``` + +### 2.编译算子工程 + + 编译自定义算子工程,构建生成自定义算子包 + + - 执行如下命令,切换到算子工程MatmulLeakyReluCustom目录 + + ``` + cd $HOME/samples/operator/MatMulLeakyReluCustomSample/FrameworkLaunch/MatmulLeakyReluCustom + ``` + + - 修改CMakePresets.json中ASCEND_CANN_PACKAGE_PATH为CANN软件包安装后的实际路径。 + + + ``` + { + …… + "configurePresets": [ + { + …… + "ASCEND_CANN_PACKAGE_PATH": { + "type": "PATH", + "value": "~/Ascend/ascend-toolkit/latest" //请替换为CANN软件包安装后的实际路径。eg:/home/HwHiAiUser/Ascend/ascend-toolkit/latest + }, + …… + } + ] + } + ``` + - 在算子工程MatmulLeakyReluCustom目录下执行如下命令,进行算子工程编译。 + + ``` + ./build.sh + ``` + 编译成功后,会在当前目录下创建build_out目录,并在build_out目录下生成自定义算子安装包custom_opp__.run,例如“custom_opp_ubuntu_x86_64.run”。 + + +### 3.部署算子包 + + - 执行如下命令,在自定义算子安装包所在路径下,安装自定义算子包。 + + ``` + cd build_out + ./custom_opp__.run + ``` + + 命令执行成功后,自定义算子包中的相关文件将部署至当前环境的OPP算子库的vendors/customize目录中。 + +## 配置环境变量 + + 这里的\$HOME需要替换为CANN包的安装路径。 + ``` + export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest + ``` + +## 通过aclnn调用的方式调用MatmulLeakyReluCustom算子工程 + +### 样例运行 + + - 进入到样例目录 + + ``` + cd $HOME/samples/operator/MatmulLeakyReluCustomSample/FrameworkLaunch/AclNNInvocation + ``` + + - 样例执行 + + 样例执行过程中会自动生成测试数据,然后编译与运行aclnn样例,最后检验运行结果。具体过程可参见run.sh脚本。 + ``` + bash run.sh + ``` + +## 更新说明 + | 时间 | 更新事项 | +|----|------| +| 2023/11/9 | 新增AclNNInvocation样例 | + + +## 已知issue + + 暂无 \ No newline at end of file diff --git a/torch_rec_op/AsynchronousCompleteCumsum/asynchronous_complete_cumsum.json b/torch_rec_op/AsynchronousCompleteCumsum/asynchronous_complete_cumsum.json new file mode 100644 index 0000000000000000000000000000000000000000..3d282b0234e76f83c14215796b0003b58a6df61f --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/asynchronous_complete_cumsum.json @@ -0,0 +1,30 @@ +[ + { + "op": "AsynchronousCompleteCumsum", + "language": "cpp", + "input_desc": [ + { + "name": "x", + "param_type": "required", + "format": [ + "ND", "ND" + ], + 
"type": [ + "int64", "int32" + ] + } + ], + "output_desc": [ + { + "name": "y", + "param_type": "required", + "format": [ + "ND", "ND" + ], + "type": [ + "int64", "int32" + ] + } + ] + } +] \ No newline at end of file diff --git a/torch_rec_op/AsynchronousCompleteCumsum/clear_all.sh b/torch_rec_op/AsynchronousCompleteCumsum/clear_all.sh new file mode 100644 index 0000000000000000000000000000000000000000..c8c732744cef69affbfeb52ed562088bd8d35083 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/clear_all.sh @@ -0,0 +1,4 @@ +rm -rf asynchronous_complete_cumsum +rm -rf aclnn_asynchronous_complete_cumsum/build +rm -rf aclnn_asynchronous_complete_cumsum/input +rm -rf aclnn_asynchronous_complete_cumsum/output \ No newline at end of file diff --git a/torch_rec_op/AsynchronousCompleteCumsum/creat.sh b/torch_rec_op/AsynchronousCompleteCumsum/creat.sh new file mode 100644 index 0000000000000000000000000000000000000000..0763f5c665f8399dc652805448630be8083b4f36 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/creat.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +set -e + + +# 查找msopgen的路径,加入到环境变量PATH中 +msopgen_path=$(find /usr/local/Ascend/ -name msopgen | grep bin) +parent_dir=$(dirname "$msopgen_path") +export PATH=$parent_dir:$PATH + +# 利用msopgen生成可编译文件 +rm -rf ./asynchronous_complete_cumsum +python3 /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/bin/msopgen gen -i asynchronous_complete_cumsum.json -f tf -c ai_core-Ascend910 -lan cpp -out ./asynchronous_complete_cumsum -m 0 -op AsynchronousCompleteCumsum +rm -rf asynchronous_complete_cumsum/op_kernel +rm -rf asynchronous_complete_cumsum/host +cp -rf op_kernel asynchronous_complete_cumsum/ +cp -rf op_host asynchronous_complete_cumsum/ +cd asynchronous_complete_cumsum + +# 判断当前目录下是否存在CMakePresets.json文件 +if [ ! -f "CMakePresets.json" ]; then + echo "ERROR, CMakePresets.json file not exist." + exit 1 +fi + +# 禁止生成CRC校验和 +sed -i 's/--nomd5/--nomd5 --nocrc/g' ./cmake/makeself.cmake + +# 修改cann安装路径 +sed -i 's:"/usr/local/Ascend/latest":"/usr/local/Ascend/ascend-toolkit/latest":g' CMakePresets.json +# 修改vendor_name 防止覆盖之前vendor_name为customize的算子; +# vendor_name需要和aclnn中的CMakeLists.txt中的CUST_PKG_PATH值同步,不同步aclnn会调用失败; +# vendor_name字段值不能包含customize;包含会导致多算子部署场景CANN的vendors路径下config.ini文件内容截取错误 +sed -i 's:"customize":"asynchronous_complete_cumsum":g' CMakePresets.json + +bash build.sh + +# # 安装编译成功的算子包 +bash ./build_out/custom_opp*.run diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_host/CMakeLists.txt b/torch_rec_op/AsynchronousCompleteCumsum/op_host/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2975907ef6c898778223103de7880a64760494d3 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_host/CMakeLists.txt @@ -0,0 +1,96 @@ +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) + +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +add_library(cust_op_proto SHARED ${ops_srcs} ${ASCEND_AUTOGEN_PATH}/op_proto.cc) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +add_library(cust_optiling SHARED ${ops_srcs}) +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB aclnn_src 
${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) +add_library(cust_opapi SHARED ${aclnn_src}) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) + +install(TARGETS cust_op_proto + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) +install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) +install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) +install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum.cpp b/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d9209f3711e133a571bed6002585cf0a7cdd6bbf --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum.cpp @@ -0,0 +1,97 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#include "asynchronous_complete_cumsum_tiling.h" +#include "register/op_def_registry.h" + +namespace { + constexpr int32_t EMBEDDING_TYPE_INT64 = 0; + constexpr int32_t EMBEDDING_TYPE_INT32 = 1; +} + +namespace optiling { + static ge::graphStatus TilingFunc(gert::TilingContext* context) + { + AsynchronousCompleteCumsumTilingData tiling; + auto inputTensor0 = context->GetInputTensor(0); + uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); + uint32_t dimNum = context->GetInputShape(0)->GetOriginShape().GetDimNum(); + + ge::DataType inputDatatype = inputTensor0->GetDataType(); + uint32_t embeddingType; + if (inputDatatype == ge::DT_INT64) { + embeddingType = EMBEDDING_TYPE_INT64; + } else if (inputDatatype == ge::DT_INT32) { + embeddingType = EMBEDDING_TYPE_INT32; + } + + tiling.set_totalLength(totalLength); + tiling.set_dimNum(dimNum); + tiling.set_inputType(embeddingType); + + if (dimNum != 1) { + printf("AsynchronousCompleteCumsum required the dim of input-0 is 1 but %ld ", dimNum); + return ge::FAILED; + } + context->SetBlockDim(1); //? 
这里的block设置 + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + + return ge::GRAPH_SUCCESS; + } +} + + +namespace ge { + static ge::graphStatus InferShape(gert::InferShapeContext* context) + { + const gert::Shape* xShape = context->GetInputShape(0); + gert::Shape* yShape = context->GetOutputShape(0); + + int64_t inputLength = xShape->GetShapeSize(); + yShape->SetDim(0, inputLength+1); + return GRAPH_SUCCESS; + } +} + + +namespace ops { + class AsynchronousCompleteCumsum : public OpDef { + public: + explicit AsynchronousCompleteCumsum(const char* name) : OpDef(name) + { + this->Input("x") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("y") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); + + this->SetInferShape(ge::InferShape); + + this->AICore() + .SetTiling(optiling::TilingFunc); + + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910"); + } + }; + + OP_ADD(AsynchronousCompleteCumsum); +} diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum_tiling.h b/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..cb3c27e5bbd08d44d108a2ddba90802458a5a99b --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_host/asynchronous_complete_cumsum_tiling.h @@ -0,0 +1,30 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#ifndef ASYNCHRONOUS_COMPLETE_CUMSUM_H +#define ASYNCHRONOUS_COMPLETE_CUMSUM_H +#include "register/tilingdata_base.h" + +namespace optiling { + BEGIN_TILING_DATA_DEF(AsynchronousCompleteCumsumTilingData) + TILING_DATA_FIELD_DEF(uint32_t, totalLength); // 处理数据的总长度 + TILING_DATA_FIELD_DEF(uint32_t, dimNum); // 数据的维度 + TILING_DATA_FIELD_DEF(uint32_t, inputType); // 数据的类型 + END_TILING_DATA_DEF; + + REGISTER_TILING_DATA_CLASS(AsynchronousCompleteCumsum, AsynchronousCompleteCumsumTilingData) +} + +#endif // ASYNCHRONOUS_COMPLETE_CUMSUM_H diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/CMakeLists.txt b/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c94a952da2058b785affa6784d78b87dfe9b3d7 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/CMakeLists.txt @@ -0,0 +1,68 @@ +# set custom compile options +if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL OPTIONS -g -O0) +endif() + +foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # generate ascendc impl py once + if (NOT TARGET 
ascendc_impl_gen) + add_ops_impl_target(TARGET ascendc_impl_gen + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl + ) + endif() + + # dynamic shape binary compile + if (${ENABLE_BINARY_PACKAGE} AND NOT ${ENABLE_CROSS_COMPILE}) + add_bin_compile_target(TARGET ascendc_bin_${compute_unit} + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + ADP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/dynamic + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + COMPUTE_UNIT ${compute_unit} + ) + add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen) + endif() + + if (${ENABLE_CROSS_COMPILE} AND ${ENABLE_BINARY_PACKAGE}) + add_cross_compile_target( + TARGET bin_${compute_unit} + OUT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../kernel + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/ + ) + endif() +endforeach() + +# generate npu_supported_ops.json +add_npu_support_target(TARGET npu_supported_ops + OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH} + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core + INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} +) + +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# install kernel file +if (${ENABLE_SOURCE_PACKAGE}) + file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.py + ) + install(FILES ${KERNEL_FILES} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) +endif() diff --git a/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/asynchronous_complete_cumsum.cpp 
b/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/asynchronous_complete_cumsum.cpp new file mode 100644 index 0000000000000000000000000000000000000000..60bc4e7455eeb351b49e1c5bdd90d43077dd5d71 --- /dev/null +++ b/torch_rec_op/AsynchronousCompleteCumsum/op_kernel/asynchronous_complete_cumsum.cpp @@ -0,0 +1,35 @@ +#include "kernel_operator.h" + +using namespace AscendC; + +extern "C" __global__ __aicore__ void asynchronous_complete_cumsum(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, + GM_ADDR tiling) { + GET_TILING_DATA(tiling_data, tiling); + int64_t totalLen = tiling_data.totalLength; + uint32_t inputType = tiling_data.inputType; + + switch (inputType) + { + case 0 : + { + __gm__ int64_t* xPtr = (__gm__ int64_t*) x; + __gm__ int64_t* yPtr = (__gm__ int64_t*) y; + *(yPtr) = 0; + for (int i=0; i ", real_result_file, real_result[:129]) + print("golden result, example-> ", golden_file, golden[:129]) + + result = np.abs(real_result - golden) + deno = np.maximum(np.abs(real_result), np.abs(golden)) + result_atol = np.less_equal(result, loss) + result_rtol = np.less_equal(result / np.add(deno, minimum), loss) + if not result_rtol.all() and not result_atol.all(): + if np.sum(result_rtol == False) > real_result.size * loss and np.sum(result_atol == False) > real_result.size * loss: + print("[ERROR] result error") + return False + print("test pass") + return True + +if __name__ == '__main__': + print("=============================x============") + verify_result("./output/y.bin", "./output/golden.bin") \ No newline at end of file diff --git a/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/CMakeLists.txt b/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fb7e9e0bc9a0bb4893841ea4954301f69bad919 --- /dev/null +++ b/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/CMakeLists.txt @@ -0,0 +1,84 @@ +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_asynchronous_complete_cumsum) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/asynchronous_complete_cumsum/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/") + set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ../../inc + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib +) + +add_definitions(-DOP_NAME=AsynchronousCompleteCumsum) 
+add_definitions(-DINPUT_NUM=1)
+add_definitions(-DOUTPUT_NUM=1)
+
+# Sources: list each file exactly once. The original list named
+# ../../src/op_runner.cpp twice and contained a stray bare directory entry
+# "../../src/" before common.cpp, which breaks the build.
+add_executable(execute_asynchronous_complete_cumsum_op
+    ../../src/operator_desc.cpp
+    ../../src/op_runner.cpp
+    main.cpp
+    ../../src/common.cpp
+)
+
+target_link_libraries(execute_asynchronous_complete_cumsum_op
+    ascendcl
+    cust_opapi
+    acl_op_compiler
+    nnopbase
+    stdc++
+)
+
+install(TARGETS execute_asynchronous_complete_cumsum_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
diff --git a/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/main.cpp b/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9dcfcbe8f4d15cead0152450d394a38f7ea12702
--- /dev/null
+++ b/torch_rec_op/OpTest/aclnn_asynchronous_complete_cumsum/src/main.cpp
@@ -0,0 +1,171 @@
+/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+ limitations under the License.
+==============================================================================*/ +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "op_runner.h" + +#include "../../inc/common.h" + +bool g_isDevice = false; +int deviceId = 0; + +OperatorDesc CreateOpDesc() +{ + // define operator + std::vector x_shape { 128 }; + std::vector y_shape { 129 }; + aclFormat format = ACL_FORMAT_ND; + + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(ACL_INT64, x_shape.size(), x_shape.data(), format); + opDesc.AddOutputTensorDesc(ACL_INT64, y_shape.size(), y_shape.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/input_x.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + INFO_LOG("Set input success"); + return true; +} + +bool ProcessOutputData(OpRunner &runner) +{ + WriteFile("../output/y.bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); + INFO_LOG("Write output success"); + return true; +} + +void DestoryResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destory resource failed"); + } else { + INFO_LOG("Destory resource success"); + } +} + +bool InitResource() +{ + // 创建输出目录output + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } + else { + ERROR_LOG("Make output directory fail"); + return false; + } + } + + // acl.json is dump or profiling config file + // aclInit接口 -> 初始化AscendCL (一个进程里面只能调用一次)。init里面的传参是dump\profiling config file, 可以不传 + if (aclInit(NULL) != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + // 运行管理资源申请 + if 
(aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + // 获取当前的RunMode + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestoryResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp() +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + if (!RunOp()) { + DestoryResource(); + return FAILED; + } + + DestoryResource(); + + return SUCCESS; +} diff --git a/torch_rec_op/OpTest/inc/aclnn.h b/torch_rec_op/OpTest/inc/aclnn.h new file mode 100644 index 0000000000000000000000000000000000000000..93562eb0561afa3bff814c57101e60f03a3fa83d --- /dev/null +++ b/torch_rec_op/OpTest/inc/aclnn.h @@ -0,0 +1,25 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#ifndef ACLNN_H +#define ACLNN_H + + +#if OP_NAME == AsynchronousCompleteCumsum +#include "aclnn_asynchronous_complete_cumsum.h" +#endif + + +#endif // ACLNN_H diff --git a/torch_rec_op/OpTest/inc/common.h b/torch_rec_op/OpTest/inc/common.h new file mode 100644 index 0000000000000000000000000000000000000000..02da30e3cb6d544849c6f39ee1184d2a79820914 --- /dev/null +++ b/torch_rec_op/OpTest/inc/common.h @@ -0,0 +1,50 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +constexpr int SUCCESS = 0; +constexpr int FAILED = 1; + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size); + +#endif // COMMON_H diff --git a/torch_rec_op/OpTest/inc/op_runner.h b/torch_rec_op/OpTest/inc/op_runner.h new file mode 100644 index 0000000000000000000000000000000000000000..e7c2faca11bc9eb63b729011c401724d4b41a1ea --- /dev/null +++ b/torch_rec_op/OpTest/inc/op_runner.h @@ -0,0 +1,187 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "aclnn/acl_meta.h" +#include "acl/acl.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + 
std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + +private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/torch_rec_op/OpTest/inc/operator_desc.h b/torch_rec_op/OpTest/inc/operator_desc.h new file mode 100644 
index 0000000000000000000000000000000000000000..caea5423900fbdb2808e8fb7e24b962c854cf8d7 --- /dev/null +++ b/torch_rec_op/OpTest/inc/operator_desc.h @@ -0,0 +1,62 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; +}; + +#endif // OPERATOR_DESC_H diff --git a/torch_rec_op/OpTest/run_test.sh b/torch_rec_op/OpTest/run_test.sh new file mode 100644 index 
0000000000000000000000000000000000000000..16a2577677d8f20ad77ddc193024e142d45e80a4 --- /dev/null +++ b/torch_rec_op/OpTest/run_test.sh @@ -0,0 +1,133 @@ + +#!/bin/bash + +# 设置环境变量 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=0 + +if [ ! $ASCEND_HOME_DIR ]; then + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest + else + export ASCEND_HOME_DIR=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export DDK_PATH=$ASCEND_HOME_DIR +arch=$(uname -m) +export NPU_HOST_LIB=$ASCEND_HOME_DIR/${arch}-linux/lib64 + +WORKER_DIR=$(pwd) +PARENT_PATH=$(dirname $(pwd)) +cd "$PARENT_PATH" + +# 编译算子 +build_op() +{ + CURRENT_DIR=$(pwd) + bash ./creat.sh +} + +op_test() { + # 1. 清除遗留生成文件和日志文件 + rm -rf $HOME/ascend/log/* + rm ./input/*.bin + rm ./output/*.bin + + # 2. 生成输入数据和真值数据 + python3 scripts/gen_data.py + if [ $? -ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + + # 3. 编译acl可执行文件 + rm -rf build; mkdir -p build; cd build + cmake ../src + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + # 4. 运行可执行文件 + cd ../output + echo "INFO: execute op!" + ./$exec_op + + if [ $? -ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + + # 5. 比较真值文件 + cd ../ + ret=`python3 scripts/verify_result.py` + echo $ret + if [ "x$ret" == "xtest pass" ]; then + echo "" + echo "#####################################" + echo "INFO: you have passed the Precision!" 
+ echo "#####################################" + echo "" + fi +} + +to_snake_case(){ + local input_str=$1 + echo "$input_str" | sed -E 's/([A-Z])/_\1/g' | sed -E 's/^_//g' | tr 'A-Z' 'a-z' +} + +prefix='aclnn_' +exec_prefix='execute_' +exec_suffix='_op' +test_all_op() { + for dir in "$PARENT_PATH"/*; do + cd "$PARENT_PATH" + if [ -d "$dir" ]; then + dir_name=$(basename "$dir") + if [ "$dir_name" != "OpTest" ]; then + echo "Entering directory: $dir_name" + cd "$dir_name" + test_dir_name="$prefix$(to_snake_case "$dir_name")" + exec_op="$exec_prefix$(to_snake_case "$dir_name")$exec_suffix" + echo "test_dir_name: $test_dir_name" + build_op + cd "$WORKER_DIR" + cd "$test_dir_name" + op_test + fi + fi + done +} + +TEST_OP_COUNT=$(($(find "$PARENT_PATH" -maxdepth 1 -type d | wc -l) - 2)) +echo -e "\033[32m==============Start to test op, total op count is: $TEST_OP_COUNT=====================\033[0m" + +if [ $# -ge 1 ]; then + work_dir_name=$1 + cd "$work_dir_name" + test_dir_name="$prefix$(to_snake_case "$work_dir_name")" + exec_op="$exec_prefix$(to_snake_case "$work_dir_name")$exec_suffix" + build_op + cd "$WORKER_DIR" + cd "$test_dir_name" + op_test +else + test_all_op +fi + + + + + + diff --git a/torch_rec_op/OpTest/src/common.cpp b/torch_rec_op/OpTest/src/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d2746d80e7f20d70fbb42dcbc2a0ab68942fc1c6 --- /dev/null +++ b/torch_rec_op/OpTest/src/common.cpp @@ -0,0 +1,84 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ +#include "common.h" + +#include +#include +#include +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/torch_rec_op/OpTest/src/op_runner.cpp b/torch_rec_op/OpTest/src/op_runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0e77c8b05886710141470419efdd62ef357282a9 --- /dev/null +++ b/torch_rec_op/OpTest/src/op_runner.cpp @@ -0,0 +1,498 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ +#include "op_runner.h" + +#include +#include + +#include +#include "acl/acl_op_compiler.h" +#include "common.h" +#include "aclnn.h" + +using namespace std; + +extern bool g_isDevice; + +// 定义宏拼接, 拼接函数名的前缀、后缀 + +#define _CONCAT_OP_FUNC_FOR_SPACE_SIZE(prefix, name, suffix) prefix##name##suffix +#define CONCAT_OP_FUNC_FOR_SPACE_SIZE(prefix, name, suffix) _CONCAT_OP_FUNC_FOR_SPACE_SIZE(prefix, name, suffix) + +#define _CONCAT_OP_FUNC(prefix, name) prefix##name +#define CONCAT_OP_FUNC(prefix, name) _CONCAT_OP_FUNC(prefix, name) + +#define INPUT_1 inputTensor_[0] +#define INPUT_2 inputTensor_[0], inputTensor_[1] +#define INPUT_3 inputTensor_[0], inputTensor_[1], inputTensor_[2] +#define INPUT_4 inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3] + +#define OUTPUT_1 outputTensor_[0] +#define OUTPUT_2 outputTensor_[0], outputTensor_[1] +#define OUTPUT_3 outputTensor_[0], outputTensor_[1], outputTensor_[2] +#define OUTPUT_4 outputTensor_[0], outputTensor_[1], outputTensor_[2], outputTensor_[3] + +#define _INPUT(i) INPUT_##i +#define _OUTPUT(i) OUTPUT_##i + +#define INPUT(i) _INPUT(i) +#define OUTPUT(i) _OUTPUT(i) + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); +} + +OpRunner::~OpRunner() +{ + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() 
+{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), + nullptr, 0, GetInputFormat(i), GetInputShape(i).data(), + GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == 
nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), + nullptr, 0, GetOutputFormat(i), GetOutputShape(i).data(), + GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + return numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. 
dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + + auto ret = CONCAT_OP_FUNC_FOR_SPACE_SIZE(aclnn, OP_NAME, GetWorkspaceSize)(INPUT(INPUT_NUM), OUTPUT(OUTPUT_NUM), + &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. 
error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAsynchronousCompleteCumsumGetWorkspaceSize success, workspace size %lu", workspaceSize); + + void *workspace = nullptr; + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = CONCAT_OP_FUNC(aclnn, OP_NAME)(workspace, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAsynchronousCompleteCumsum success"); + + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + auto beforeTime = std::chrono::steady_clock::now(); + for (int i = 0; i<100; i++) { + ret = CONCAT_OP_FUNC_FOR_SPACE_SIZE(aclnn, OP_NAME, GetWorkspaceSize)(INPUT(INPUT_NUM), OUTPUT(OUTPUT_NUM), + &workspaceSize, &handle); + ret = CONCAT_OP_FUNC(aclnn, OP_NAME)(workspace, workspaceSize, handle, stream); + } + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + auto afterTime = std::chrono::steady_clock::now(); + double duration_microsecond = std::chrono::duration(afterTime - beforeTime).count(); + std::cout << "time cost " << duration_microsecond/100 << " us" << std::endl; + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return 
true; +} + + +template +void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void 
OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} + +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} diff --git a/torch_rec_op/OpTest/src/operator_desc.cpp b/torch_rec_op/OpTest/src/operator_desc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6726ae3ebcbdd2550c0dfdde3204336d583741a0 --- /dev/null +++ b/torch_rec_op/OpTest/src/operator_desc.cpp @@ -0,0 +1,60 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ +#include "common.h" +#include "operator_desc.h" + +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + + outputDesc.emplace_back(desc); + return *this; +}