diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/inc/common.h b/cust_op/attention_fusion/aclnn_attention_fusion/inc/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..7036813a972219f046e079072fe2e1ed94670e5f
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/inc/common.h
@@ -0,0 +1,45 @@
+/**
+* @file common.h
+*
+* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+#define SUCCESS 0
+#define FAILED 1
+
+#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args)
+#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args)
+#define ERROR_LOG(fmt, args...) fprintf(stderr, "[ERROR] " fmt "\n", ##args)
+
+/**
+ * @brief Read data from file
+ * @param [in] filePath: file path
+ * @param [out] fileSize: file size
+ * @return read result
+ */
+bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize);
+
+/**
+ * @brief Write data to file
+ * @param [in] filePath: file path
+ * @param [in] buffer: data to write to file
+ * @param [in] size: size to write
+ * @return write result
+ */
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size);
+
+#endif // COMMON_H
diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/inc/op_runner.h b/cust_op/attention_fusion/aclnn_attention_fusion/inc/op_runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf2a9ef43d12950dae6f19b98d48f5a6a36ac02f
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/inc/op_runner.h
@@ -0,0 +1,182 @@
+/**
+* @file op_runner.h
+*
+* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "aclnn/acl_meta.h" +#include "acl/acl.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + +private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h b/cust_op/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..225e8480fb3389aa0430032107b4cc59219d082c --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h @@ -0,0 +1,58 @@ +/** +* @file operator_desc.h +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/
+#ifndef OPERATOR_DESC_H
+#define OPERATOR_DESC_H
+
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+/**
+ * Op description
+ */
+struct OperatorDesc {
+    /**
+     * Constructor
+     */
+    explicit OperatorDesc();
+
+    /**
+     * Destructor
+     */
+    virtual ~OperatorDesc();
+
+    /**
+     * Add an input tensor description
+     * @param [in] dataType: data type
+     * @param [in] numDims: number of dims
+     * @param [in] dims: dims
+     * @param [in] format: format
+     * @return OperatorDesc
+     */
+    OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);
+
+    /**
+     * Add an output tensor description
+     * @param [in] dataType: data type
+     * @param [in] numDims: number of dims
+     * @param [in] dims: dims
+     * @param [in] format: format
+     * @return OperatorDesc
+     */
+    OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);
+
+    std::string opType;
+    std::vector<aclTensorDesc *> inputDesc;
+    std::vector<aclTensorDesc *> outputDesc;
+    int32_t maskOnOptional;
+};
+
+#endif // OPERATOR_DESC_H
diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/run.sh b/cust_op/attention_fusion/aclnn_attention_fusion/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..754446063c73ce2fc280555d463cf31c82e3a0a3
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/run.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+export ASCEND_GLOBAL_LOG_LEVEL=0
+
+CURRENT_DIR=$(
+    cd $(dirname ${BASH_SOURCE:-$0})
+    pwd
+)
+cd $CURRENT_DIR
+
+# Export environment variables
+SHORT=v:,
+LONG=dtype:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+while :
+do
+    case "$1" in
+        # float16, float, int32
+        (-v | --dtype)
+            DTYPE="$2"
+            shift 2;;
+        (--)
+            shift;
+            break;;
+        (*)
+            echo "[ERROR] Unexpected option: $1";
+            break;;
+    esac
+done
+
+if [ ! $ASCEND_HOME_DIR ]; then
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest
+    else
+        export ASCEND_HOME_DIR=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+#source $ASCEND_HOME_DIR/bin/setenv.bash
+
+export DDK_PATH=$ASCEND_HOME_DIR
+arch=$(uname -m)
+export NPU_HOST_LIB=$ASCEND_HOME_DIR/${arch}-linux/lib64
+
+function main {
+    # 1. Clean up leftover generated files and log files
+    rm -rf $HOME/ascend/log/*
+    rm ./input/*.bin
+    rm ./output/*.bin
+
+    # 2. Generate input data and golden data
+    cd $CURRENT_DIR
+    python3 scripts/gen_data.py
+    if [ $? -ne 0 ]; then
+        echo "ERROR: generate input data failed!"
+        return 1
+    fi
+    echo "INFO: generate input data success!"
+
+    # 3. Build the acl executable
+    cd $CURRENT_DIR; rm -rf build; mkdir -p build; cd build
+    cmake ../src
+    if [ $? -ne 0 ]; then
+        echo "ERROR: cmake failed!"
+        return 1
+    fi
+    echo "INFO: cmake success!"
+    make
+    if [ $? -ne 0 ]; then
+        echo "ERROR: make failed!"
+        return 1
+    fi
+    echo "INFO: make success!"
+
+    # 4. Run the executable
+    cd $CURRENT_DIR/output
+    echo "INFO: execute op!"
+    ./execute_attention_fusion_op
+
+    if [ $? -ne 0 ]; then
+        echo "ERROR: acl executable run failed! please check your project!"
+        return 1
+    fi
+    echo "INFO: acl executable run success!"
+
+    # 5. Compare with the golden files
+    cd $CURRENT_DIR
+    ret=`python3 scripts/verify_result.py output/output_atten_score.bin output/output_softmax_out.bin output/golden_atten_score.bin output/golden_softmax_out.bin `
+    echo $ret
+    if [ "x$ret" == "xtest pass" ]; then
+        echo ""
+        echo "#####################################"
+        echo "INFO: you have passed the Precision!"
+        echo "#####################################"
+        echo ""
+    fi
+}
+
+main
diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py b/cust_op/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..2abefa0769120bf070cb39c285d2f8c14f5ee130
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+# -*- coding:utf-8 -*-
+# Copyright 2022-2023 Huawei Technologies Co., Ltd
+import numpy as np
+import os
+from math import sqrt
+
+def softmax(src):
+    # rowmax along the last axis (take the max of each row)
+    max = np.max(src, axis=-1, keepdims=True)
+    sub = src - max
+    exp = np.exp(sub)
+    # rowsum along the last axis (sum each row)
+    sum = np.sum(exp, axis=-1, keepdims=True)
+    dst = exp / sum
+    return dst
+
+
+def golden_atten_fusion(query, key, value, atten_mask):
+    qk = np.matmul(query, key.transpose(0, 2, 1))
+    attnDimSqrt = 1 / sqrt(query.shape[2])
+    attnWeight = np.multiply(qk, attnDimSqrt)
+    atten_mask = np.add(10000, np.multiply(atten_mask, -10000))
+    addMask = np.add(attnWeight, atten_mask)
+    qk_div = softmax(addMask)
+
+    out = np.matmul(qk_div, value)
+    return out, qk_div
+
+def gen_golden_data_simple():
+    input_query = np.random.uniform(-1, 1, [1024, 1000, 80]).astype(np.float32)
+    input_key = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
+    input_value = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
+    input_atten_mask = np.random.randint(0, 2, size=(1024, 1000, 50)).astype(np.float32)
+
+    golden_atten_score, gold_softmax_out = golden_atten_fusion(input_query, input_key, input_value, input_atten_mask)
+
+    os.system("mkdir -p input")
+    os.system("mkdir -p output")
+    input_query.tofile("./input/input_query.bin")
+    input_key.tofile("./input/input_key.bin")
+    input_value.tofile("./input/input_value.bin")
+    input_atten_mask.tofile("./input/input_atten_mask.bin")
+
+    golden_atten_score.tofile("./output/golden_atten_score.bin")
+    gold_softmax_out.tofile("./output/golden_softmax_out.bin")
+
+if __name__ == "__main__":
+    gen_golden_data_simple()
diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/scripts/verify_result.py b/cust_op/attention_fusion/aclnn_attention_fusion/scripts/verify_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cdcde9a05c1a85e04c28118882fccff3fa4d309
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/scripts/verify_result.py
@@ -0,0 +1,28 @@
+import os
+import sys
+import numpy as np
+
+loss = 1e-3
+minimum = 10e-10
+
+def verify_result(real_result, golden):
+    real_result = np.fromfile(real_result, dtype=np.float32)
+    golden = np.fromfile(golden, dtype=np.float32)
+    print(real_result[:32])
+    print(golden[:32])
+    result = np.abs(real_result - golden)
+    deno = np.maximum(np.abs(real_result), np.abs(golden))
+    result_atol = np.less_equal(result, loss)
+    result_rtol = np.less_equal(result / np.add(deno, minimum), loss)
+    if not result_rtol.all() and not result_atol.all():
+        if np.sum(result_rtol == False) > real_result.size * loss and np.sum(result_atol == False) > real_result.size * loss:
+            print("[ERROR] result error")
+            return False
+    print("test pass")
+    return True
+
+if __name__ == '__main__':
+    print("=============================Softmax out============")
+    verify_result(sys.argv[2], sys.argv[4])
+    print("=============================attn score============")
+    verify_result(sys.argv[1], sys.argv[3])
\ No newline at end of file
diff --git 
a/cust_op/attention_fusion/aclnn_attention_fusion/src/CMakeLists.txt b/cust_op/attention_fusion/aclnn_attention_fusion/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ad6b958708b628a97719bda7982d2bcbbd8b2de --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/CMakeLists.txt @@ -0,0 +1,68 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. + +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_attention_fusion) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/attention_fusion/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/") + set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ../inc + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_attention_fusion_op + operator_desc.cpp + op_runner.cpp + main.cpp + op_runner.cpp + common.cpp +) + +target_link_libraries(execute_attention_fusion_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_attention_fusion_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/src/common.cpp b/cust_op/attention_fusion/aclnn_attention_fusion/src/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..47a39cd5f190b0472542a2037146d54fc3b5eb0b --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/common.cpp @@ -0,0 +1,79 @@ +/** +* @file common.cpp +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include "common.h" + +#include +#include +#include +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/src/main.cpp b/cust_op/attention_fusion/aclnn_attention_fusion/src/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..76a8f61c7038e83a1431d605d1ab264e8843fefa --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/main.cpp @@ -0,0 +1,180 @@ +/** +* @file main.cpp +* +* Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "op_runner.h" + +#include "common.h" + +bool g_isDevice = false; +int deviceId = 0; + +OperatorDesc CreateOpDesc() +{ + // define operator + std::vector shapeQuery { 1024, 1000, 80 }; + std::vector shapeKey { 1024, 50, 80 }; + std::vector shapeValue { 1024, 50, 80 }; + std::vector shapeAttenMask { 1024, 1000, 50 }; + std::vector shapeAttenScore{ 1024, 1000, 80 }; + std::vector shapeSoftmaxOut {1024, 1000, 50 }; + aclDataType dataTypeQuery = ACL_FLOAT; + aclDataType dataTypeKey = ACL_FLOAT; + aclDataType dataTypeValue = ACL_FLOAT; + aclDataType dataTypeAttenMask = ACL_FLOAT; + aclDataType dataTypeAttenScore = ACL_FLOAT; + aclDataType dataTypeSoftmaxOut = ACL_FLOAT; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.maskOnOptional = 1; + opDesc.AddInputTensorDesc(dataTypeQuery, shapeQuery.size(), shapeQuery.data(), format); + opDesc.AddInputTensorDesc(dataTypeKey, shapeKey.size(), shapeKey.data(), format); + opDesc.AddInputTensorDesc(dataTypeValue, shapeValue.size(), shapeValue.data(), format); + opDesc.AddInputTensorDesc(dataTypeAttenMask, shapeAttenMask.size(), shapeAttenMask.data(), format); + opDesc.AddOutputTensorDesc(dataTypeAttenScore, shapeAttenScore.size(), shapeAttenScore.data(), format); + opDesc.AddOutputTensorDesc(dataTypeSoftmaxOut, shapeSoftmaxOut.size(), shapeSoftmaxOut.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/input_query.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + ReadFile("../input/input_key.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1)); + ReadFile("../input/input_value.bin", fileSize, runner.GetInputBuffer(2), runner.GetInputSize(2)); + ReadFile("../input/input_atten_mask.bin", fileSize, runner.GetInputBuffer(3), 
runner.GetInputSize(3)); + INFO_LOG("Set input success"); + return true; +} + +bool ProcessOutputData(OpRunner &runner) +{ + WriteFile("../output/output_atten_score.bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); + WriteFile("../output/output_softmax_out.bin", runner.GetOutputBuffer(1), runner.GetOutputSize(1)); + INFO_LOG("Write output success"); + return true; +} + +void DestoryResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destory resource failed"); + } else { + INFO_LOG("Destory resource success"); + } +} + +bool InitResource() +{ + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } + else { + ERROR_LOG("Make output directory fail"); + return false; + } + } + + // acl.json is dump or profiling config file + if (aclInit(NULL) != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestoryResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp() +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + if (!RunOp()) { + DestoryResource(); + return FAILED; + } + + DestoryResource(); + + return SUCCESS; +} diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp b/cust_op/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0bba4a1b3d14066fd25d2aec906689015533b99e --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp @@ -0,0 +1,466 @@ +/** +* @file op_runner.cpp +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/ +#include "op_runner.h" +#include "aclnn_attention_fusion.h" +#include +#include +#include +#include "acl/acl_op_compiler.h" +#include "common.h" + +using namespace std; + +extern bool g_isDevice; + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); +} + +OpRunner::~OpRunner() +{ + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), + nullptr, 0, GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), + nullptr, 0, GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + return 
numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + auto ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], + opDesc_->maskOnOptional, outputTensor_[0], outputTensor_[1], + &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAttentionFusionGetWorkspaceSize success, workspace size %lu", workspaceSize); + + void *workspace = nullptr; + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = aclnnAttentionFusion(workspace, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAttentionFusion success"); + + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. 
error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + auto beforeTime = std::chrono::steady_clock::now(); + for (int i = 0; i<100; i++) { + ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], + opDesc_->maskOnOptional, outputTensor_[0], outputTensor_[1], + &workspaceSize, &handle); + ret = aclnnAttentionFusion(workspace, workspaceSize, handle, stream); + } + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + auto afterTime = std::chrono::steady_clock::now(); + double duration_microsecond = std::chrono::duration(afterTime - beforeTime).count(); + std::cout << "time cost " << duration_microsecond/100 << " us" << std::endl; + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return true; +} + + +template +void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} + +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/src/operator_desc.cpp b/cust_op/attention_fusion/aclnn_attention_fusion/src/operator_desc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a7a1ed3bd9f6b33a7a697c9aa67045112093081a --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/operator_desc.cpp @@ -0,0 +1,56 @@ +/** +* @file operator_desc.cpp +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include "common.h" +#include "operator_desc.h" + +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } + +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + + outputDesc.emplace_back(desc); + return *this; +} diff --git a/cust_op/attention_fusion/attention_fusion.json b/cust_op/attention_fusion/attention_fusion.json new file mode 100644 index 0000000000000000000000000000000000000000..130f1c66d9ca5430ac08b16548bea6620ab4cc96 --- /dev/null +++ b/cust_op/attention_fusion/attention_fusion.json @@ -0,0 +1,78 @@ +[ + { + "op": "AttentionFusion", + "language": "cpp", + "input_desc": [ + { + "name": "query", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "key", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "value", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "attn_mask", + "param_type": "optional", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "output_desc": [ + { + "name": "atten_score", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "softmax_out", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "attr": [ + { + "name": "mask_on", + "param_type": "optional", + "type": "int", + "default_value": 0 + } + ] + } +] \ No newline at end of file diff --git a/cust_op/attention_fusion/op_host/CMakeLists.txt 
b/cust_op/attention_fusion/op_host/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..40dd51cfac524b0a9607b7d8b2813edd2210c509 --- /dev/null +++ b/cust_op/attention_fusion/op_host/CMakeLists.txt @@ -0,0 +1,82 @@ + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) + +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +add_library(cust_op_proto SHARED ${ops_srcs} ${ASCEND_AUTOGEN_PATH}/op_proto.cc) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +add_library(cust_optiling SHARED ${ops_srcs}) +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) +add_library(cust_opapi SHARED ${aclnn_src}) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) + +install(TARGETS cust_op_proto + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) +install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) +install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) +install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) diff --git a/cust_op/attention_fusion/op_host/attention_fusion.cpp b/cust_op/attention_fusion/op_host/attention_fusion.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4f2b8823faacd4f54bdb6ce5bc756548dd8c9b06 --- /dev/null +++ b/cust_op/attention_fusion/op_host/attention_fusion.cpp @@ -0,0 +1,275 @@ +#include +#include +#include "attention_fusion_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +#define TEST_LOG(fmt, args...) 
fprintf(stdout, fmt "\n", ##args) + +namespace optiling { +#define ge::GRAPH_FAILED 1 +#define ge::GRAPH_SUCCESS 0 +#define ALIGN_32 (32 / sizeof(float)) +#define RESERVER_UB_SIZE (20 * 1024) +#define ALREADY_ALIGNED 1 +#define SPECIAL_CASE 2 +#define SPECIAL_ROW_SIZE (16 * 16) +#define SPECIAL_Q_DIM1 500 +#define SPECIAL_K_DIM1 50 +#define ORIG_UNPAD_DIM0 16 +#define ORIG_UNPAD_DIM1 (16 * 50) +#define TRANSPOSED_PADDED_DIM0 (56 * 16) +#define TRANSPOSED_PADDED_DIM1 16 +#define ORIG_PADDED_DIM0 16 +#define ORIG_PADDED_DIM1 (56 * 16) +#define TRANSPOSED_UNPAD_DIM0 (16 * 50) +#define TRANSPOSED_UNPAD_DIM1 16 +#define UB_TILES 3 +#define DIM0 0 +#define DIM1 1 +#define DIM2 2 + +static int32_t MatmulTiling(gert::TilingContext* context, AttentionFusionTilingData &tilingData) +{ + // q (B, M, K) k (B, N, K) v (B, V, K) + auto qShape = context->GetInputShape(0)->GetStorageShape(); + auto kShape = context->GetInputShape(1)->GetStorageShape(); + auto vShape = context->GetInputShape(2)->GetStorageShape(); + + auto ascnedPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + size_t coreNum = ascnedPlatform.GetCoreNumAic(); + // qkMatmul configuration + matmul_tiling::MultiCoreMatmulTiling qkMm(ascnedPlatform); + qkMm.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + qkMm.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + qkMm.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + qkMm.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + qkMm.SetShape(qShape.GetDim(DIM1), kShape.GetDim(DIM1), kShape.GetDim(DIM2)); + + qkMm.SetSingleShape(qShape.GetDim(DIM1), kShape.GetDim(DIM1), kShape.GetDim(DIM2)); + + qkMm.SetBias(false); + qkMm.SetBufferSpace(-1, -1, -1); + qkMm.SetDim(coreNum); + + // kvBmm Matmul Tilling + matmul_tiling::MultiCoreMatmulTiling kvMm(ascnedPlatform); + kvMm.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + kvMm.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + kvMm.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + kvMm.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + kvMm.SetShape(qShape.GetDim(DIM1), vShape.GetDim(DIM2), kShape.GetDim(DIM1)); + kvMm.SetSingleShape(qShape.GetDim(DIM1), vShape.GetDim(DIM2), kShape.GetDim(DIM1)); + + kvMm.SetBias(false); + kvMm.SetBufferSpace(-1, -1, -1); + kvMm.SetDim(coreNum); + + // Get tilingData using on the kernel side + if (qkMm.GetTiling(tilingData.qkMatmulTiling) == -1 || + kvMm.GetTiling(tilingData.kvMatmulTiling) == -1) { + return ge::GRAPH_FAILED; + } + return ge::GRAPH_SUCCESS; +} + +static int32_t SoftmaxTiling(gert::TilingContext* context, AttentionFusionTilingData &tilingData, uint64_t ub) +{ + auto qShape = context->GetInputShape(0)->GetStorageShape(); + auto kShape = context->GetInputShape(1)->GetStorageShape(); + auto vShape = context->GetInputShape(2)->GetStorageShape(); + const int32_t* maskIsOn = context->GetAttrs()->GetAttrPointer(0); + + int numOfelement = kShape.GetDim(DIM1) / ALIGN_32; + uint8_t attr = 0; + int normalizeColumn = 0; + + // Get column with 32bytes alignment + if ((kShape.GetDim(DIM1) % ALIGN_32) == 0) { + normalizeColumn = numOfelement 
* ALIGN_32; + attr = ALREADY_ALIGNED; + } else if (kShape.GetDim(DIM1) == SPECIAL_K_DIM1 && qShape.GetDim(DIM1) > SPECIAL_Q_DIM1) { + normalizeColumn = numOfelement * ALIGN_32 + ALIGN_32; + attr = SPECIAL_CASE; + } else { + normalizeColumn = numOfelement * ALIGN_32 + ALIGN_32; + } + + if ((sizeof(float) * normalizeColumn) > (ub / UB_TILES)) { + return ge::GRAPH_FAILED; + } + + // Get how many rows that half of ub contains + int normalizeRow = ub / UB_TILES / (sizeof(float) * normalizeColumn); + if (normalizeRow > qShape.GetDim(DIM1)) { + normalizeRow = qShape.GetDim(DIM1); + } + + if (attr == SPECIAL_CASE) { + normalizeRow = SPECIAL_ROW_SIZE; + } + + // Get max Ub left for softmax shared tmp buffer + uint64_t maxLocalWorkSize = ub - (normalizeRow * normalizeColumn * sizeof(float) * 2); + + const ge::Shape softmaxShape({normalizeRow, normalizeColumn}); + const uint32_t minLocalWorkSize = AscendC::GetSoftMaxMinTmpSize(softmaxShape, sizeof(float), false); + if (minLocalWorkSize > maxLocalWorkSize) { + return ge::GRAPH_FAILED; + } + + // divisor should not be 0 + if (normalizeRow == 0) { + return ge::GRAPH_FAILED; + } + int normalizeLoop = qShape.GetDim(DIM1) / normalizeRow; + normalizeLoop = ((qShape.GetDim(DIM1) % normalizeRow) == 0) ? normalizeLoop : normalizeLoop + 1; + float res = sqrt(qShape.GetDim(DIM2)); + float dimSqrt = (res != 0) ? (1 / res) : 0; + + // set tiling data + tilingData.set_normalizeAttr(attr); + tilingData.set_attnDim(qShape.GetDim(DIM2)); + tilingData.set_queryDim1(qShape.GetDim(DIM1)); + tilingData.set_queryDim2(qShape.GetDim(DIM2)); + tilingData.set_keyDim1(kShape.GetDim(DIM1)); + tilingData.set_valueDim2(vShape.GetDim(DIM2)); + tilingData.set_batchNum(qShape.GetDim(DIM0)); + tilingData.set_normalizeLoop(normalizeLoop); + tilingData.set_normalizeRow(normalizeRow); + tilingData.set_normalizeColumn(normalizeColumn); + tilingData.set_maskIsOn(*maskIsOn); + tilingData.set_normalizeSqrt(dimSqrt); + tilingData.set_maxSharedTmpBuf(maxLocalWorkSize); + + AscendC::SoftMaxTilingFunc(softmaxShape, sizeof(float), maxLocalWorkSize, tilingData.softMaxTilingData); + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus TilingFunc(gert::TilingContext* context) +{ + AttentionFusionTilingData tilingData; + + // Platform configuration + auto ascnedPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + size_t systemWorkspacesSize = ascnedPlatform.GetLibApiWorkSpaceSize(); + currentWorkspace[0] = 0 + systemWorkspacesSize; + size_t coreNum = ascnedPlatform.GetCoreNumAic(); + + uint64_t ub; + ascnedPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub); + ub = ub - RESERVER_UB_SIZE; + + if (MatmulTiling(context, tilingData) != ge::GRAPH_SUCCESS || + SoftmaxTiling(context, tilingData, ub) != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + + // Get tiling data for transposing the origin tensor, then the transposed tensor is going to be padded. + std::vector shapeVec = {ORIG_UNPAD_DIM0, ORIG_UNPAD_DIM1}; + ge::Shape srcShape(shapeVec); + AscendC::GetConfusionTransposeTilingInfo(srcShape, 0, sizeof(float), 7, tilingData.confusionTransposeTilingData); + + // Get tiling data for transposing the padded tensor. 
+ std::vector shapeVec1 = {TRANSPOSED_PADDED_DIM0, TRANSPOSED_PADDED_DIM1}; + ge::Shape srcShape1(shapeVec1); + AscendC::GetConfusionTransposeTilingInfo(srcShape1, 0, sizeof(float), 7, tilingData.confusionTransposeTilingData1); + + // Get tiling data for transposing the padded tensor, then the transposed tensor is going to be unpadded. + std::vector shapeVec2 = {ORIG_PADDED_DIM0, ORIG_PADDED_DIM1}; + ge::Shape srcShape2(shapeVec2); + AscendC::GetConfusionTransposeTilingInfo(srcShape2, 0, sizeof(float), 7, tilingData.confusionTransposeTilingData2); + + // Get tiling data for transposing the unpadded tensor back to original shape. + std::vector shapeVec3 = {TRANSPOSED_UNPAD_DIM0, TRANSPOSED_UNPAD_DIM1}; + ge::Shape srcShape3(shapeVec3); + AscendC::GetConfusionTransposeTilingInfo(srcShape3, 0, sizeof(float), 7, tilingData.confusionTransposeTilingData3); + + context->SetBlockDim(coreNum); + tilingData.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tilingData.GetDataSize()); + return ge::GRAPH_SUCCESS; +} +} + + +namespace ge { +static ge::graphStatus InferShape(gert::InferShapeContext* context) +{ + const gert::Shape* qShape = context->GetInputShape(0); + const gert::Shape* kShape = context->GetInputShape(1); + const gert::Shape* vShape = context->GetInputShape(2); + + gert::Shape* attnScoreShape = context->GetOutputShape(0); + gert::Shape* softmaxOutShape = context->GetOutputShape(1); + + attnScoreShape->SetDimNum(3); + attnScoreShape->SetDim(0, qShape->GetDim(DIM0)); + attnScoreShape->SetDim(1, qShape->GetDim(DIM1)); + attnScoreShape->SetDim(2, vShape->GetDim(DIM2)); + + softmaxOutShape->SetDimNum(3); + softmaxOutShape->SetDim(0, qShape->GetDim(DIM0)); + softmaxOutShape->SetDim(1, qShape->GetDim(DIM1)); + softmaxOutShape->SetDim(2, kShape->GetDim(DIM1)); + + return GRAPH_SUCCESS; +} +static ge::graphStatus InferDtype(gert::InferDataTypeContext* context) +{ + context->SetOutputDataType(0, context->GetInputDataType(0)); + context->SetOutputDataType(1, context->GetInputDataType(1)); + return GRAPH_SUCCESS; +} +} + + +namespace ops { +class AttentionFusion : public OpDef { +public: + explicit AttentionFusion(const char* name) : OpDef(name) + { + this->Input("query") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("key") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("value") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("attn_mask") + .ParamType(OPTIONAL) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("atten_score") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("softmax_out") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Attr("mask_on").Int(); + + this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDtype); + + this->AICore() + .SetTiling(optiling::TilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910"); + } +}; + +OP_ADD(AttentionFusion); +} diff --git a/cust_op/attention_fusion/op_host/attention_fusion_tiling.h b/cust_op/attention_fusion/op_host/attention_fusion_tiling.h new 
file mode 100644
index 0000000000000000000000000000000000000000..3a6695e67b4fd172609728ab9b7c36e58c9c0cfb
--- /dev/null
+++ b/cust_op/attention_fusion/op_host/attention_fusion_tiling.h
@@ -0,0 +1,28 @@
+#include "register/tilingdata_base.h"
+#include "tiling/tiling_api.h"
+namespace optiling {
+    BEGIN_TILING_DATA_DEF(AttentionFusionTilingData)
+    TILING_DATA_FIELD_DEF(uint8_t, normalizeAttr);
+    TILING_DATA_FIELD_DEF(float, attnDim);
+    TILING_DATA_FIELD_DEF(int32_t, queryDim1);
+    TILING_DATA_FIELD_DEF(int32_t, queryDim2);
+    TILING_DATA_FIELD_DEF(int32_t, keyDim1);
+    TILING_DATA_FIELD_DEF(int32_t, valueDim2);
+    TILING_DATA_FIELD_DEF(int32_t, batchNum);
+    TILING_DATA_FIELD_DEF(int32_t, normalizeLoop);
+    TILING_DATA_FIELD_DEF(int32_t, normalizeRow);
+    TILING_DATA_FIELD_DEF(int32_t, normalizeColumn);
+    TILING_DATA_FIELD_DEF(int32_t, maskIsOn);
+    TILING_DATA_FIELD_DEF(float, normalizeSqrt);
+    TILING_DATA_FIELD_DEF(uint64_t, maxSharedTmpBuf);
+    TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, qkMatmulTiling);
+    TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, kvMatmulTiling);
+    TILING_DATA_FIELD_DEF_STRUCT(SoftMaxTiling, softMaxTilingData);
+    TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData);
+    TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData1);
+    TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData2);
+    TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData3);
+    END_TILING_DATA_DEF;
+
+    REGISTER_TILING_DATA_CLASS(AttentionFusion, AttentionFusionTilingData)
+}
diff --git a/cust_op/attention_fusion/op_kernel/CMakeLists.txt b/cust_op/attention_fusion/op_kernel/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2f0e48294cfb99987502b97d46ef9ff85bc7ed7f
--- /dev/null
+++ b/cust_op/attention_fusion/op_kernel/CMakeLists.txt
@@ -0,0 +1,60 @@
+# set custom compile options
+if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
+    add_ops_compile_options(ALL OPTIONS -g -O0)
+endif()
+
+foreach(compute_unit ${ASCEND_COMPUTE_UNIT})
+
+    # generate aic-${compute_unit}-ops-info.json
+    add_ops_info_target(TARGET ops_info_gen_${compute_unit}
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json
+        OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+        INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}
+    )
+
+    # generate ascendc impl py once
+    if (NOT TARGET ascendc_impl_gen)
+        add_ops_impl_target(TARGET ascendc_impl_gen
+            OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+            IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR}
+            OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe
+            INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl
+        )
+    endif()
+
+    # dynamic shape binary compile
+    if (${ENABLE_BINARY_PACKAGE})
+        add_bin_compile_target(TARGET ascendc_bin_${compute_unit}
+            OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+            IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR}
+            ADP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/dynamic
+            OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}
+            INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
+            COMPUTE_UNIT ${compute_unit}
+        )
+        add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen)
+    endif()
+endforeach()
+
+# generate npu_supported_ops.json
+add_npu_support_target(TARGET npu_supported_ops
+    OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH}
+    OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core
+ INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} +) + +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# install kernel file +if (${ENABLE_SOURCE_PACKAGE}) + file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.py + ) + install(FILES ${KERNEL_FILES} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) +endif() diff --git a/cust_op/attention_fusion/op_kernel/attention_fusion.cpp b/cust_op/attention_fusion/op_kernel/attention_fusion.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fe2c88db200763a0d71b3e28851586e901ca3af5 --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/attention_fusion.cpp @@ -0,0 +1,24 @@ +#include "kernel_operator.h" +#include "attention_fusion_kernel.h" +using namespace AscendC; + +// call of kernel function +extern "C" __global__ __aicore__ void attention_fusion(GM_ADDR query, GM_ADDR key, GM_ADDR value, GM_ADDR attnMask, GM_ADDR attenScore, GM_ADDR softmaxOut, GM_ADDR workspace, GM_ADDR tiling) { + GET_TILING_DATA(tiling_data, tiling); + + const TCubeTiling *qkMatmulTiling = &tiling_data.qkMatmulTiling; + const TCubeTiling *kvMatmulTiling = &tiling_data.kvMatmulTiling; + const SoftMaxTiling *softMaxTilingData = &tiling_data.softMaxTilingData; + + AttentionFusionArgs args { + query, key, value, attnMask, attenScore, softmaxOut, tiling_data.normalizeAttr, tiling_data.queryDim1, + tiling_data.queryDim2, tiling_data.keyDim1, tiling_data.valueDim2, tiling_data.batchNum, + tiling_data.normalizeLoop, tiling_data.normalizeRow, tiling_data.normalizeColumn, tiling_data.maskIsOn, + tiling_data.normalizeSqrt, tiling_data.maxSharedTmpBuf, qkMatmulTiling, kvMatmulTiling, + softMaxTilingData, &tiling_data.confusionTransposeTilingData, &tiling_data.confusionTransposeTilingData1, + &tiling_data.confusionTransposeTilingData2, &tiling_data.confusionTransposeTilingData3 + }; + + AttentionFusionKernel kernel; + kernel.Compute(args); +} \ No newline at end of file diff --git a/cust_op/attention_fusion/op_kernel/attention_fusion_kernel.h b/cust_op/attention_fusion/op_kernel/attention_fusion_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..63788fef9e2bf558dc9f034f3888cfa7a6a23f78 --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/attention_fusion_kernel.h @@ -0,0 +1,161 @@ +#ifndef ATTENTION_FUSION_KERNEL_H +#define ATTENTION_FUSION_KERNEL_H + +#include +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "q_k_bmm_compute.h" +#include "k_v_bmm_compute.h" +#include "normalize_compute.h" + +using namespace AscendC; + +struct AttentionFusionArgs { + GM_ADDR query; + GM_ADDR key; + GM_ADDR value; + GM_ADDR attnMask; + GM_ADDR attenScore; + GM_ADDR softmaxOut; + + uint8_t normalizeAttr; + int32_t queryDim1; + int32_t queryDim2; + int32_t keyDim1; + int32_t valueDim2; + int32_t batchNum; + int32_t normalizeLoop; + int32_t normalizeRow; + int32_t normalizeColumn; + int32_t maskIsOn; + float normalizeSqrt; + uint64_t maxSharedTmpBuf; + + const TCubeTiling* qkMatmulTiling; + const TCubeTiling* kvMatmulTiling; + const SoftMaxTiling* softMaxTilingData; + + const ConfusionTransposeTiling* confusionTransposeTilingData; + const ConfusionTransposeTiling* confusionTransposeTilingData1; + const ConfusionTransposeTiling* confusionTransposeTilingData2; + const ConfusionTransposeTiling* confusionTransposeTilingData3; +}; + +struct 
AttentionFusionPipe { + TPipe* pipe; +}; + +template +__aicore__ inline T1 CeilDiv(T1 a, T2 b) { + if (b == 0) { + return 0; + } + return (a + b -1) / b; +} + +template +class AttentionFusionKernel { + public: + __aicore__ inline AttentionFusionKernel() {}; + + __aicore__ inline void Compute(AttentionFusionArgs args) + { + // Args + this->args = args; + + // Matmul Register + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), qKBmmCompute.mm, args.qkMatmulTiling, kvBmmCompute.mm, + args.kvMatmulTiling); + + // batch offset + GetBatchOffsetAndLen(args.batchNum, this->batchOffset, this->batchLen); + + // QKBmm Initialize + QKBmmArgs qKBmmArgs { + args.query, args.key, args.softmaxOut, + args.queryDim1, args.keyDim1, args.queryDim2, + batchOffset, batchLen + }; + QKBmmPipeArgs qKBmmPipeArgs {&pipe}; + qKBmmCompute.Init(qKBmmArgs, qKBmmPipeArgs); + + NormalizeArgs normalArgs { + &pipe, args.normalizeAttr, args.queryDim1, args.keyDim1, args.normalizeLoop, args.normalizeRow, + args.normalizeColumn, args.maskIsOn, args.normalizeSqrt, args.maxSharedTmpBuf, args.softMaxTilingData, + args.confusionTransposeTilingData, args.confusionTransposeTilingData1, + args.confusionTransposeTilingData2, args.confusionTransposeTilingData3 + }; + normalizeCompute.Init(normalArgs); + + // KVBmm Initialize + KVBmmArgs kvBmmArgs { + args.softmaxOut, args.value, args.attenScore, + args.queryDim1, args.valueDim2, args.keyDim1, + batchOffset + }; + KVBmmPipeArgs kvBmmPipeArgs {&pipe}; + kvBmmCompute.Init(kvBmmArgs, kvBmmPipeArgs); + + // Start compute + Process(); + } + + __aicore__ inline void Process() + { + QKBmmComputePart(); + NormalizeMatmulFusion(); + } + private: + __aicore__ inline void QKBmmComputePart() + { + qKBmmCompute.Process(); + } + + __aicore__ inline void NormalizeMatmulFusion() + { + GlobalTensor softmaxOutGbTensorThisCore; + softmaxOutGbTensorThisCore.SetGlobalBuffer(reinterpret_cast<__gm__ qType*>(args.softmaxOut), + batchLen * args.queryDim1 * args.keyDim1); + GlobalTensor softmaxGbMaskThisCore; + softmaxGbMaskThisCore.SetGlobalBuffer(reinterpret_cast<__gm__ qType*>(args.attnMask), + batchLen * args.queryDim1 * args.keyDim1); + for(int i = 0; i < batchLen + 1; i++) { + if (i != batchLen) { + GlobalTensor softmaxOutGbTensor = + softmaxOutGbTensorThisCore[(batchOffset + i) * args.queryDim1 * args.keyDim1]; + GlobalTensor softmaxGbMaskTensor = + softmaxGbMaskThisCore[(batchOffset + i) * args.queryDim1 * args.keyDim1]; + /* normallize */ + normalizeCompute.Process(softmaxOutGbTensor, softmaxGbMaskTensor); + } + + if (i != 0) { + /* matmul */ + kvBmmCompute.ComputeOneBatch(i - 1); + } + } + } + + __aicore__ inline void GetBatchOffsetAndLen(int batchNum, int& batchOffset, int& batchLen) + { + // batch offset + int blockLenPerCore = CeilDiv(batchNum, (GetBlockNum() * 2)); + batchOffset = blockLenPerCore*GetBlockIdx(); + batchLen = blockLenPerCore; + if (batchOffset + batchLen > batchNum) { + batchLen = batchNum - batchOffset; + } + } + + private: + TPipe pipe; + AttentionFusionArgs args; + QKBmmCompute qKBmmCompute; + KVBmmCompute kvBmmCompute; + NormalizeCompute normalizeCompute; + + int batchOffset; + int batchLen; +}; + +#endif \ No newline at end of file diff --git a/cust_op/attention_fusion/op_kernel/k_v_bmm_compute.h b/cust_op/attention_fusion/op_kernel/k_v_bmm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..16a7ca7f92b2e01201042721df83915071e4fac8 --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/k_v_bmm_compute.h @@ -0,0 +1,69 @@ +#ifndef KV_BMMM_COMPUTE__H 
+#define KV_BMMM_COMPUTE__H + +#include "kernel_operator.h" +using namespace AscendC; + + +struct KVBmmArgs { + GM_ADDR softmaxOut; + GM_ADDR value; + GM_ADDR out; + + int M; + int N; + int K; + + int batchOffset; +}; + +struct KVBmmPipeArgs { + TPipe* pipe; +}; + +template +class KVBmmCompute { +public: + __aicore__ inline KVBmmCompute(){} + + __aicore__ inline void Init(KVBmmArgs kvBmmArgs, KVBmmPipeArgs pipeArgs) + { + this->kvBmmArgs = kvBmmArgs; + this->pipeArgs = pipeArgs; + + // kernel batch offset + sGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ sType*>(kvBmmArgs.softmaxOut), kvBmmArgs.M * kvBmmArgs.K); + sGlobal = sGlobal[kvBmmArgs.batchOffset * kvBmmArgs.M * kvBmmArgs.K]; + vGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ vType*>(kvBmmArgs.value), kvBmmArgs.N * kvBmmArgs.K); + vGlobal = vGlobal[kvBmmArgs.batchOffset * kvBmmArgs.N * kvBmmArgs.K]; + outGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ vType*>(kvBmmArgs.out), kvBmmArgs.M * kvBmmArgs.N); + outGlobal = outGlobal[kvBmmArgs.batchOffset * kvBmmArgs.M * kvBmmArgs.N]; + } + + __aicore__ inline void ComputeOneBatch(int batchI) + { + if (batchI != 0) { + mm.WaitIterateAll(); + mm.End(); + } + + mm.SetTensorA(sGlobal[batchI * kvBmmArgs.M * kvBmmArgs.K]); + mm.SetTensorB(vGlobal[batchI * kvBmmArgs.N * kvBmmArgs.K]); + + mm.template IterateAll(outGlobal[batchI * kvBmmArgs.M * kvBmmArgs.N], 0, false, true); + } + + matmul::Matmul< + matmul::MatmulType, + matmul::MatmulType, + matmul::MatmulType, + matmul::MatmulType + > mm; +private: + KVBmmArgs kvBmmArgs; + KVBmmPipeArgs pipeArgs; + GlobalTensor sGlobal; + GlobalTensor vGlobal; + GlobalTensor outGlobal; +}; +#endif \ No newline at end of file diff --git a/cust_op/attention_fusion/op_kernel/normalize_compute.h b/cust_op/attention_fusion/op_kernel/normalize_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..94f44a1c9f69c3eb0c192d8eb9b3756106985bed --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/normalize_compute.h @@ -0,0 +1,240 @@ +#ifndef NORMALIZE_COMPUTE__H +#define NORMALIZE_COMPUTE__H +#include +#include "kernel_operator.h" +using namespace AscendC; +#define ALIGN_32 32 +#define ALREADY_ALIGNED 1 +#define SPECIAL_CASE 2 +#define PAD_SIZE (16 * 56 * 16) +#define SPECIAL_BLOCK_COUNT 16 +#define SPECIAL_BLOCK_LEN (16 * 56 * 16) +#define SPECIAL_STRIDE (6 * 16 / 8) +#define PAD_VALUE -1000 + +struct NormalizeArgs { + TPipe* pipe; + + uint8_t attr; + int queryDim1; + int keyDim1; + int loopCount; + int normalizeRow; + int normalizeColumn; + int maskIsOn; + float normalizeSqrt; + uint64_t maxSharedTmpBuf; + + const SoftMaxTiling* tiling; + + const ConfusionTransposeTiling* confusionTransposeTilingData; + const ConfusionTransposeTiling* confusionTransposeTilingData1; + const ConfusionTransposeTiling* confusionTransposeTilingData2; + const ConfusionTransposeTiling* confusionTransposeTilingData3; +}; + +template +class NormalizeCompute { +public: + __aicore__ inline NormalizeCompute(){} + + __aicore__ inline void Init(NormalizeArgs normalArgs) + { + this->args = normalArgs; + int bufSize = args.normalizeRow * args.normalizeColumn * sizeof(qType); + args.pipe->InitBuffer(vecInQueue, 1, bufSize); + args.pipe->InitBuffer(vecOutQueue, 1, bufSize); + args.pipe->InitBuffer(vecSharedQueue, 1, args.maxSharedTmpBuf); + } + + __aicore__ inline void DoPadLocal(LocalTensor& sourceTensor, LocalTensor& mindTensor, + const ConfusionTransposeTiling* confusionTransposeTilingData, + const ConfusionTransposeTiling*
confusionTransposeTilingData1) + { + ConfusionTransposeTiling tiling = *confusionTransposeTilingData; + ConfusionTranspose(mindTensor, sourceTensor, TransposeType::TRANSPOSE_ND2ND_ONLY, tiling); + Duplicate(sourceTensor, PAD_VALUE, PAD_SIZE); + DataCopyParams dataCopyParam = {0, 0, 0, 0}; + dataCopyParam.blockCount = SPECIAL_BLOCK_COUNT; + dataCopyParam.blockLen = SPECIAL_BLOCK_LEN; + dataCopyParam.srcStride = 0; + dataCopyParam.dstStride = SPECIAL_STRIDE; + DataCopy(sourceTensor, mindTensor, dataCopyParam); + + ConfusionTransposeTiling tiling1 = *confusionTransposeTilingData1; + ConfusionTranspose(mindTensor, sourceTensor, TransposeType::TRANSPOSE_ND2ND_ONLY, tiling1); + DataCopy(sourceTensor, mindTensor, PAD_SIZE); + } + + + __aicore__ inline void DoUnPadLocal(LocalTensor& sourceTensor, LocalTensor& mindTensor, + const ConfusionTransposeTiling* confusionTransposeTilingData2, + const ConfusionTransposeTiling* confusionTransposeTilingData3) + { + ConfusionTransposeTiling tiling = *confusionTransposeTilingData2; + ConfusionTranspose(mindTensor, sourceTensor, TransposeType::TRANSPOSE_ND2ND_ONLY, tiling); + Duplicate(sourceTensor, PAD_VALUE, PAD_SIZE); + DataCopyParams dataCopyParam = {0, 0, 0, 0}; + dataCopyParam.blockCount = SPECIAL_BLOCK_COUNT; + dataCopyParam.blockLen = SPECIAL_BLOCK_LEN; + dataCopyParam.srcStride = SPECIAL_STRIDE; + dataCopyParam.dstStride = 0; + DataCopy(sourceTensor, mindTensor, dataCopyParam); + + ConfusionTransposeTiling tiling1 = *confusionTransposeTilingData3; + ConfusionTranspose(mindTensor, sourceTensor, TransposeType::TRANSPOSE_ND2ND_ONLY, tiling1); + DataCopy(sourceTensor, mindTensor, PAD_SIZE); + } + + __aicore__ inline void Process(GlobalTensor softmaxGlobleTensor, GlobalTensor softmaxGbMask) + { + srcGloblePtr = softmaxGlobleTensor; + maskGloblePtr = softmaxGbMask; + offset = 0; + usedRowCount = 0; + uint8_t padLen = args.normalizeColumn - args.keyDim1; + padParams = {false, 0, padLen, 0}; + + for (int i = 0; i < args.loopCount; i++) { + /* Get height of softmax matrix and handle the last loop height */ + height = ((args.queryDim1 - usedRowCount) < args.normalizeRow) ? 
+ args.queryDim1 - usedRowCount : args.normalizeRow; + totalSize = height * args.normalizeColumn; + + CopyIn(); + PreCompute(); + Compute(); + CopyOut(); + + usedRowCount += height; + offset += args.normalizeRow * args.keyDim1; + } + } + +private: + __aicore__ inline void CopyMask() + { + LocalTensor LocalMask = vecSharedQueue.AllocTensor(); + if (args.attr == ALREADY_ALIGNED) { + DataCopy(LocalMask, maskGloblePtr[offset], totalSize); + } else if (args.attr == SPECIAL_CASE) { + DataCopy(LocalMask, maskGloblePtr[offset], totalSize); + } else { + DataCopyPad(LocalMask, maskGloblePtr[offset], copyParams, padParams); + } + + vecSharedQueue.EnQue(LocalMask); + } + + __aicore__ inline void CopyIn() + { + LocalTensor inLocalTensor = vecInQueue.AllocTensor(); + LocalTensor outLocalTensor = vecOutQueue.AllocTensor(); + + if (args.attr == ALREADY_ALIGNED) { + DataCopy(inLocalTensor, srcGloblePtr[offset], totalSize); + } else if (args.attr == SPECIAL_CASE) { + DataCopy(inLocalTensor, srcGloblePtr[offset], totalSize); + } else { + copyParams.blockCount = height; + copyParams.blockLen = args.keyDim1 * sizeof(qType); + DataCopyPad(inLocalTensor, srcGloblePtr[offset], copyParams, padParams); + } + + if (args.maskIsOn == 1) { + CopyMask(); + } + + vecInQueue.EnQue(inLocalTensor); + vecOutQueue.EnQue(outLocalTensor); + } + + __aicore__ inline void PreCompute() + { + LocalTensor inLocalTensor = vecInQueue.DeQue(); + LocalTensor outLocalTensor = vecOutQueue.DeQue(); + + if (args.attr == SPECIAL_CASE && args.maskIsOn == 1) { + LocalTensor LocalMask = vecSharedQueue.DeQue(); + DoPadLocal(LocalMask, outLocalTensor, args.confusionTransposeTilingData, + args.confusionTransposeTilingData1); + DoPadLocal(inLocalTensor, outLocalTensor, args.confusionTransposeTilingData, + args.confusionTransposeTilingData1); + vecSharedQueue.EnQue(LocalMask); + } else if (args.attr == SPECIAL_CASE) { + DoPadLocal(inLocalTensor, outLocalTensor, args.confusionTransposeTilingData, + args.confusionTransposeTilingData1); + } + + // atten_weight = qkMatMul / sqrt(atten_dim) + Muls(inLocalTensor, inLocalTensor, args.normalizeSqrt, totalSize); + + if (args.maskIsOn == 1) { + LocalTensor LocalMask = vecSharedQueue.DeQue(); + // atten_mask = (1 - mask) * 10000 + Muls(LocalMask, LocalMask, (float)-10000, totalSize); + Adds(LocalMask, LocalMask, (float)10000, totalSize); + + // atten_weight = atten_weight + atten_mask + Add(inLocalTensor, inLocalTensor, LocalMask, totalSize); + vecSharedQueue.FreeTensor(LocalMask); + } + + vecInQueue.EnQue(inLocalTensor); + vecOutQueue.EnQue(outLocalTensor); + } + + __aicore__ inline void Compute() + { + LocalTensor inLocalTensor = vecInQueue.DeQue(); + LocalTensor outLocalTensor = vecOutQueue.DeQue(); + LocalTensor sharedTmpBuf = vecSharedQueue.AllocTensor(); + + SoftMaxShapeInfo scrShape ={height, (uint32_t)args.normalizeColumn, height, (uint32_t)args.keyDim1}; + SoftMax(outLocalTensor, inLocalTensor, sharedTmpBuf, *args.tiling, scrShape); + + if (args.attr == SPECIAL_CASE) { + DoUnPadLocal(outLocalTensor, inLocalTensor, args.confusionTransposeTilingData2, + args.confusionTransposeTilingData3); + } + vecOutQueue.EnQue(outLocalTensor); + vecInQueue.FreeTensor(inLocalTensor); + vecSharedQueue.FreeTensor(sharedTmpBuf); + } + + __aicore__ inline void CopyOut() + { + LocalTensor outLocalTensor = vecOutQueue.DeQue(); + + if (args.attr == ALREADY_ALIGNED) { + DataCopy(srcGloblePtr[offset], outLocalTensor, totalSize); + } else if (args.attr == SPECIAL_CASE) { + uint32_t thisLen = height * args.keyDim1 * 
sizeof(qType); + if ((thisLen % ALIGN_32) != 0) { + DataCopyExtParams dataCopyParamTail {1, thisLen, 0, 0, 0}; + DataCopyPad(srcGloblePtr[offset], outLocalTensor, dataCopyParamTail); + } else { + DataCopy(srcGloblePtr[offset], outLocalTensor, height * args.keyDim1); + } + } else { + DataCopyPad(srcGloblePtr[offset], outLocalTensor, copyParams); + } + vecOutQueue.FreeTensor(outLocalTensor); + } + +private: + NormalizeArgs args; + TQue vecInQueue; + TQue vecOutQueue; + TQue vecSharedQueue; + + GlobalTensor srcGloblePtr; + GlobalTensor maskGloblePtr; + uint32_t height = 0; + int offset = 0; + int usedRowCount = 0; + uint32_t totalSize = 0; + struct DataCopyExtParams copyParams; + struct DataCopyPadExtParams padParams; +}; +#endif \ No newline at end of file diff --git a/cust_op/attention_fusion/op_kernel/q_k_bmm_compute.h b/cust_op/attention_fusion/op_kernel/q_k_bmm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..3fb2d3d14e015138b14a94c7d6faa817dc329e43 --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/q_k_bmm_compute.h @@ -0,0 +1,73 @@ +#ifndef QK_BMMM_COMPUTE__H +#define QK_BMMM_COMPUTE__H +#include +#include "attention_fusion_kernel.h" +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +using namespace AscendC; + +struct QKBmmArgs { + GM_ADDR query; + GM_ADDR key; + GM_ADDR out; + + int dimM; + int dimN; + int dimK; + + int batchOffset; + int batchLen; +}; + +struct QKBmmPipeArgs { + TPipe* pipe; +}; + +template +class QKBmmCompute { +public: + __aicore__ inline QKBmmCompute() {} + + __aicore__ inline void Init(QKBmmArgs qKBmmArgs, QKBmmPipeArgs pipeArgs) + { + this->qKBmmArgs = qKBmmArgs; + + // kernel batch offset + qGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ qType*>(qKBmmArgs.query), + qKBmmArgs.batchLen * qKBmmArgs.dimM * qKBmmArgs.dimK); + qGlobal = qGlobal[qKBmmArgs.batchOffset * qKBmmArgs.dimM * qKBmmArgs.dimK]; + + kGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ kType*>(qKBmmArgs.key), + qKBmmArgs.batchLen * qKBmmArgs.dimN * qKBmmArgs.dimK); + kGlobal = kGlobal[qKBmmArgs.batchOffset * qKBmmArgs.dimN * qKBmmArgs.dimK]; + + outGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ kType*>(qKBmmArgs.out), + qKBmmArgs.batchLen * qKBmmArgs.dimM * qKBmmArgs.dimN); + outGlobal = outGlobal[qKBmmArgs.batchOffset * qKBmmArgs.dimM * qKBmmArgs.dimN]; + } + + __aicore__ inline void Process() + { + for (int thisBatch = 0 ; thisBatch < qKBmmArgs.batchLen; thisBatch++) { + mm.SetTensorA(qGlobal[thisBatch * qKBmmArgs.dimM * qKBmmArgs.dimK]); + mm.SetTensorB(kGlobal[thisBatch * qKBmmArgs.dimN * qKBmmArgs.dimK], true); + + mm.IterateAll(outGlobal[thisBatch * qKBmmArgs.dimM * qKBmmArgs.dimN], 0, false); + } + mm.End(); + } + + matmul::Matmul< + matmul::MatmulType, + matmul::MatmulType , + matmul::MatmulType, + matmul::MatmulType + > mm; +private: + QKBmmArgs qKBmmArgs; + GlobalTensor qGlobal; + GlobalTensor kGlobal; + GlobalTensor outGlobal; +}; +#endif \ No newline at end of file diff --git a/cust_op/attention_fusion/run.sh b/cust_op/attention_fusion/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9f2acd0217cdb96e3b3b08b2ace47a9075674292 --- /dev/null +++ b/cust_op/attention_fusion/run.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e + +# Locate msopgen and add its directory to the PATH environment variable +msopgen_path=$(find /usr/local/Ascend/ -name msopgen | grep bin) +parent_dir=$(dirname "$msopgen_path") +export PATH=$parent_dir:$PATH + +# Use msopgen to generate the buildable operator project +rm -rf ./attention_fusion +/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/bin/msopgen gen -i attention_fusion.json -f tf -c ai_core-Ascend910B1 -lan cpp -out ./attention_fusion -m 0 -op AttentionFusion +rm -rf attention_fusion/op_kernel +rm -rf attention_fusion/host +cp -rf op_kernel attention_fusion/ +cp -rf op_host attention_fusion/ + +cd attention_fusion + +# Check that CMakePresets.json exists in the current directory +if [ ! -f "CMakePresets.json" ]; then + echo "ERROR: CMakePresets.json file does not exist." + exit 1 +fi + +# Disable CRC checksum generation +sed -i 's/--nomd5/--nomd5 --nocrc/g' ./cmake/makeself.cmake + +# Adjust the CANN installation path +sed -i 's:"/usr/local/Ascend/latest":"/usr/local/Ascend/ascend-toolkit/latest":g' CMakePresets.json +# Change vendor_name so that previously installed operators whose vendor_name is customize are not overwritten; +# vendor_name must stay in sync with the CUST_PKG_PATH value in the aclnn CMakeLists.txt, otherwise the aclnn call fails; +# the vendor_name value must not contain customize, or the config.ini file under CANN's vendors path is truncated incorrectly when multiple operators are deployed +sed -i 's:"customize":"attention_fusion":g' CMakePresets.json + +bash build.sh + +# Install the successfully built operator package +bash ./build_out/custom_opp*.run +cd ../aclnn_attention_fusion +bash run.sh +# cd .. +# rm -rf ./attention_fusion \ No newline at end of file
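
Note on the per-core batch split: attention_fusion_kernel.h distributes batches across cores in GetBatchOffsetAndLen, giving each block CeilDiv(batchNum, GetBlockNum() * 2) consecutive batches and clamping the tail block to whatever remains. The standalone host-side sketch below only reproduces that arithmetic; batchNum, blockDim, and the extra zero clamp for blocks that start past the end are illustrative additions, not part of the kernel.

#include <cstdio>

// Mirrors CeilDiv from attention_fusion_kernel.h.
static int CeilDiv(int a, int b) { return (b == 0) ? 0 : (a + b - 1) / b; }

int main() {
    const int batchNum = 24;      // illustrative total batch count
    const int blockDim = 8 * 2;   // stands in for GetBlockNum() * 2
    const int blockLenPerCore = CeilDiv(batchNum, blockDim);
    for (int blockIdx = 0; blockIdx < blockDim; ++blockIdx) {  // stands in for GetBlockIdx()
        int batchOffset = blockLenPerCore * blockIdx;
        int batchLen = blockLenPerCore;
        if (batchOffset + batchLen > batchNum) {
            batchLen = batchNum - batchOffset;  // tail clamp, as in GetBatchOffsetAndLen
        }
        if (batchLen < 0) {
            batchLen = 0;  // demo-only guard for blocks that start past batchNum
        }
        printf("block %2d: batchOffset=%2d batchLen=%d\n", blockIdx, batchOffset, batchLen);
    }
    return 0;
}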
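
Note on the fused loop: NormalizeMatmulFusion in attention_fusion_kernel.h runs batchLen + 1 iterations so the vector-side normalize of batch i overlaps with the cube-side softmax(QK^T) x V matmul of batch i - 1. The sketch below isolates that control flow from the hardware calls; NormalizeBatch and MatmulBatch are placeholders standing in for normalizeCompute.Process and kvBmmCompute.ComputeOneBatch.

#include <cstdio>

// Stand-ins for normalizeCompute.Process and kvBmmCompute.ComputeOneBatch.
static void NormalizeBatch(int i) { std::printf("normalize batch %d\n", i); }
static void MatmulBatch(int i) { std::printf("matmul batch %d\n", i); }

// batchLen + 1 iterations: the first iteration has no finished softmax to feed
// the matmul yet, and the last iteration only drains the final matmul.
static void PipelinedLoop(int batchLen) {
    for (int i = 0; i < batchLen + 1; ++i) {
        if (i != batchLen) {
            NormalizeBatch(i);   // vector side works on batch i
        }
        if (i != 0) {
            MatmulBatch(i - 1);  // cube side consumes batch i - 1
        }
    }
}

int main() {
    PipelinedLoop(3);  // illustrative batch count
    return 0;
}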
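
Note on the normalize math: per its own comments, normalize_compute.h scales the Q*K^T block by normalizeSqrt (assumed here to be the host-precomputed 1/sqrt(attn_dim)), adds (1 - mask) * 10000 when maskIsOn is set, and then applies a row-wise SoftMax. The scalar sketch below mirrors that per-row sequence for checking results against the kernel; the function name and std::vector interface are illustrative, not part of the operator.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for one row of the normalize stage: scale, optional mask, softmax.
std::vector<float> NormalizeRowReference(const std::vector<float>& qkRow,
                                         const std::vector<float>& maskRow,
                                         float normalizeSqrt, bool maskIsOn)
{
    std::vector<float> row(qkRow.size());
    if (row.empty()) {
        return row;
    }
    for (std::size_t i = 0; i < qkRow.size(); ++i) {
        float w = qkRow[i] * normalizeSqrt;        // Muls(in, in, normalizeSqrt, ...)
        if (maskIsOn) {
            w += (1.0f - maskRow[i]) * 10000.0f;   // Muls(mask, -10000) + Adds(10000) + Add
        }
        row[i] = w;
    }
    // Row-wise softmax, the scalar counterpart of the SoftMax API call.
    float maxVal = *std::max_element(row.begin(), row.end());
    float sum = 0.0f;
    for (float& v : row) {
        v = std::exp(v - maxVal);
        sum += v;
    }
    for (float& v : row) {
        v /= sum;
    }
    return row;
}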