diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/inc/common.h b/cust_op/attention_fusion/aclnn_attention_fusion/inc/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..7036813a972219f046e079072fe2e1ed94670e5f
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/inc/common.h
@@ -0,0 +1,45 @@
+/**
+* @file common.h
+*
+* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+#define SUCCESS 0
+#define FAILED 1
+
+#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args)
+#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args)
+#define ERROR_LOG(fmt, args...) fprintf(stderr, "[ERROR] " fmt "\n", ##args)
+
+/**
+ * @brief Read data from file
+ * @param [in] filePath: file path
+ * @param [out] fileSize: file size
+ * @return read result
+ */
+bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize);
+
+/**
+ * @brief Write data to file
+ * @param [in] filePath: file path
+ * @param [in] buffer: data to write to file
+ * @param [in] size: size to write
+ * @return write result
+ */
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size);
+
+#endif // COMMON_H
diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/inc/op_runner.h b/cust_op/attention_fusion/aclnn_attention_fusion/inc/op_runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf2a9ef43d12950dae6f19b98d48f5a6a36ac02f
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/inc/op_runner.h
@@ -0,0 +1,182 @@
+/**
+* @file op_runner.h
+*
+* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "aclnn/acl_meta.h" +#include "acl/acl.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + +private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h b/cust_op/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..225e8480fb3389aa0430032107b4cc59219d082c --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h @@ -0,0 +1,58 @@ +/** +* @file operator_desc.h +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/
+#ifndef OPERATOR_DESC_H
+#define OPERATOR_DESC_H
+
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+/**
+ * Op description
+ */
+struct OperatorDesc {
+    /**
+     * Constructor
+     */
+    explicit OperatorDesc();
+
+    /**
+     * Destructor
+     */
+    virtual ~OperatorDesc();
+
+    /**
+     * Add an input tensor description
+     * @param [in] dataType: data type
+     * @param [in] numDims: number of dims
+     * @param [in] dims: dims
+     * @param [in] format: format
+     * @return OperatorDesc
+     */
+    OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);
+
+    /**
+     * Add an output tensor description
+     * @param [in] dataType: data type
+     * @param [in] numDims: number of dims
+     * @param [in] dims: dims
+     * @param [in] format: format
+     * @return OperatorDesc
+     */
+    OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);
+
+    std::string opType;
+    std::vector<aclTensorDesc *> inputDesc;
+    std::vector<aclTensorDesc *> outputDesc;
+    int32_t maskOnOptional;
+};
+
+#endif // OPERATOR_DESC_H
diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/run.sh b/cust_op/attention_fusion/aclnn_attention_fusion/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..754446063c73ce2fc280555d463cf31c82e3a0a3
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/run.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+export ASCEND_GLOBAL_LOG_LEVEL=0
+
+CURRENT_DIR=$(
+    cd $(dirname ${BASH_SOURCE:-$0})
+    pwd
+)
+cd $CURRENT_DIR
+
+# Export environment variables
+SHORT=v:,
+LONG=dtype:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+while :
+do
+    case "$1" in
+        # float16, float, int32
+        (-v | --dtype)
+            DTYPE="$2"
+            shift 2;;
+        (--)
+            shift;
+            break;;
+        (*)
+            echo "[ERROR] Unexpected option: $1";
+            break;;
+    esac
+done
+
+if [ ! $ASCEND_HOME_DIR ]; then
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest
+    else
+        export ASCEND_HOME_DIR=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+#source $ASCEND_HOME_DIR/bin/setenv.bash
+
+export DDK_PATH=$ASCEND_HOME_DIR
+arch=$(uname -m)
+export NPU_HOST_LIB=$ASCEND_HOME_DIR/${arch}-linux/lib64
+
+function main {
+    # 1. Clean up leftover generated files and log files
+    rm -rf $HOME/ascend/log/*
+    rm ./input/*.bin
+    rm ./output/*.bin
+
+    # 2. Generate input data and golden data
+    cd $CURRENT_DIR
+    python3 scripts/gen_data.py
+    if [ $? -ne 0 ]; then
+        echo "ERROR: generate input data failed!"
+        return 1
+    fi
+    echo "INFO: generate input data success!"
+
+    # 3. Build the acl executable
+    cd $CURRENT_DIR; rm -rf build; mkdir -p build; cd build
+    cmake ../src
+    if [ $? -ne 0 ]; then
+        echo "ERROR: cmake failed!"
+        return 1
+    fi
+    echo "INFO: cmake success!"
+    make
+    if [ $? -ne 0 ]; then
+        echo "ERROR: make failed!"
+        return 1
+    fi
+    echo "INFO: make success!"
+
+    # 4. Run the executable
+    cd $CURRENT_DIR/output
+    echo "INFO: execute op!"
+    ./execute_attention_fusion_op
+
+    if [ $? -ne 0 ]; then
+        echo "ERROR: acl executable run failed! please check your project!"
+        return 1
+    fi
+    echo "INFO: acl executable run success!"
+
+    # 5. Compare with the golden files
+    cd $CURRENT_DIR
+    ret=`python3 scripts/verify_result.py output/output_atten_score.bin output/output_softmax_out.bin output/golden_atten_score.bin output/golden_softmax_out.bin `
+    echo $ret
+    if [ "x$ret" == "xtest pass" ]; then
+        echo ""
+        echo "#####################################"
+        echo "INFO: you have passed the Precision!"
+        echo "#####################################"
+        echo ""
+    fi
+}
+
+main
diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py b/cust_op/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..2abefa0769120bf070cb39c285d2f8c14f5ee130
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+# -*- coding:utf-8 -*-
+# Copyright 2022-2023 Huawei Technologies Co., Ltd
+import numpy as np
+import os
+from math import sqrt
+
+def softmax(src):
+    # rowmax along the last axis (take the max of each row)
+    max = np.max(src, axis=-1, keepdims=True)
+    sub = src - max
+    exp = np.exp(sub)
+    # rowsum along the last axis (sum each row)
+    sum = np.sum(exp, axis=-1, keepdims=True)
+    dst = exp / sum
+    return dst
+
+
+def golden_atten_fusion(query, key, value, atten_mask):
+    qk = np.matmul(query, key.transpose(0, 2, 1))
+    attnDimSqrt = 1 / sqrt(query.shape[2])
+    attnWeight = np.multiply(qk, attnDimSqrt)
+    atten_mask = np.add(10000, np.multiply(atten_mask, -10000))
+    addMask = np.add(attnWeight, atten_mask)
+    qk_div = softmax(addMask)
+
+    out = np.matmul(qk_div, value)
+    return out, qk_div
+
+def gen_golden_data_simple():
+    input_query = np.random.uniform(-1, 1, [1024, 1000, 80]).astype(np.float32)
+    input_key = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
+    input_value = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
+    input_atten_mask = np.random.randint(0, 2, size=(1024, 1000, 50)).astype(np.float32)
+
+    golden_atten_score, gold_softmax_out = golden_atten_fusion(input_query, input_key, input_value, input_atten_mask)
+
+    os.system("mkdir -p input")
+    os.system("mkdir -p output")
+    input_query.tofile("./input/input_query.bin")
+    input_key.tofile("./input/input_key.bin")
+    input_value.tofile("./input/input_value.bin")
+    input_atten_mask.tofile("./input/input_atten_mask.bin")
+
+    golden_atten_score.tofile("./output/golden_atten_score.bin")
+    gold_softmax_out.tofile("./output/golden_softmax_out.bin")
+
+if __name__ == "__main__":
+    gen_golden_data_simple()
diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/scripts/verify_result.py b/cust_op/attention_fusion/aclnn_attention_fusion/scripts/verify_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cdcde9a05c1a85e04c28118882fccff3fa4d309
--- /dev/null
+++ b/cust_op/attention_fusion/aclnn_attention_fusion/scripts/verify_result.py
@@ -0,0 +1,28 @@
+import os
+import sys
+import numpy as np
+
+loss = 1e-3
+minimum = 10e-10
+
+def verify_result(real_result, golden):
+    real_result = np.fromfile(real_result, dtype=np.float32)
+    golden = np.fromfile(golden, dtype=np.float32)
+    print(real_result[:32])
+    print(golden[:32])
+    result = np.abs(real_result - golden)
+    deno = np.maximum(np.abs(real_result), np.abs(golden))
+    result_atol = np.less_equal(result, loss)
+    result_rtol = np.less_equal(result / np.add(deno, minimum), loss)
+    if not result_rtol.all() and not result_atol.all():
+        if np.sum(result_rtol == False) > real_result.size * loss and np.sum(result_atol == False) > real_result.size * loss:
+            print("[ERROR] result error")
+            return False
+    print("test pass")
+    return True
+
+if __name__ == '__main__':
+    print("=============================Softmax out============")
+    verify_result(sys.argv[2], sys.argv[4])
+    print("=============================attn score============")
+    verify_result(sys.argv[1], sys.argv[3])
\ No newline at end of file
diff --git 
a/cust_op/attention_fusion/aclnn_attention_fusion/src/CMakeLists.txt b/cust_op/attention_fusion/aclnn_attention_fusion/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ad6b958708b628a97719bda7982d2bcbbd8b2de --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/CMakeLists.txt @@ -0,0 +1,68 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. + +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_attention_fusion) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/attention_fusion/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/") + set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ../inc + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_attention_fusion_op + operator_desc.cpp + op_runner.cpp + main.cpp + op_runner.cpp + common.cpp +) + +target_link_libraries(execute_attention_fusion_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_attention_fusion_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/src/common.cpp b/cust_op/attention_fusion/aclnn_attention_fusion/src/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..47a39cd5f190b0472542a2037146d54fc3b5eb0b --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/common.cpp @@ -0,0 +1,79 @@ +/** +* @file common.cpp +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include "common.h" + +#include +#include +#include +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/src/main.cpp b/cust_op/attention_fusion/aclnn_attention_fusion/src/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..76a8f61c7038e83a1431d605d1ab264e8843fefa --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/main.cpp @@ -0,0 +1,180 @@ +/** +* @file main.cpp +* +* Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "op_runner.h" + +#include "common.h" + +bool g_isDevice = false; +int deviceId = 0; + +OperatorDesc CreateOpDesc() +{ + // define operator + std::vector shapeQuery { 1024, 1000, 80 }; + std::vector shapeKey { 1024, 50, 80 }; + std::vector shapeValue { 1024, 50, 80 }; + std::vector shapeAttenMask { 1024, 1000, 50 }; + std::vector shapeAttenScore{ 1024, 1000, 80 }; + std::vector shapeSoftmaxOut {1024, 1000, 50 }; + aclDataType dataTypeQuery = ACL_FLOAT; + aclDataType dataTypeKey = ACL_FLOAT; + aclDataType dataTypeValue = ACL_FLOAT; + aclDataType dataTypeAttenMask = ACL_FLOAT; + aclDataType dataTypeAttenScore = ACL_FLOAT; + aclDataType dataTypeSoftmaxOut = ACL_FLOAT; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.maskOnOptional = 1; + opDesc.AddInputTensorDesc(dataTypeQuery, shapeQuery.size(), shapeQuery.data(), format); + opDesc.AddInputTensorDesc(dataTypeKey, shapeKey.size(), shapeKey.data(), format); + opDesc.AddInputTensorDesc(dataTypeValue, shapeValue.size(), shapeValue.data(), format); + opDesc.AddInputTensorDesc(dataTypeAttenMask, shapeAttenMask.size(), shapeAttenMask.data(), format); + opDesc.AddOutputTensorDesc(dataTypeAttenScore, shapeAttenScore.size(), shapeAttenScore.data(), format); + opDesc.AddOutputTensorDesc(dataTypeSoftmaxOut, shapeSoftmaxOut.size(), shapeSoftmaxOut.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/input_query.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + ReadFile("../input/input_key.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1)); + ReadFile("../input/input_value.bin", fileSize, runner.GetInputBuffer(2), runner.GetInputSize(2)); + ReadFile("../input/input_atten_mask.bin", fileSize, runner.GetInputBuffer(3), 
runner.GetInputSize(3)); + INFO_LOG("Set input success"); + return true; +} + +bool ProcessOutputData(OpRunner &runner) +{ + WriteFile("../output/output_atten_score.bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); + WriteFile("../output/output_softmax_out.bin", runner.GetOutputBuffer(1), runner.GetOutputSize(1)); + INFO_LOG("Write output success"); + return true; +} + +void DestoryResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destory resource failed"); + } else { + INFO_LOG("Destory resource success"); + } +} + +bool InitResource() +{ + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } + else { + ERROR_LOG("Make output directory fail"); + return false; + } + } + + // acl.json is dump or profiling config file + if (aclInit(NULL) != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestoryResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp() +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + if (!RunOp()) { + DestoryResource(); + return FAILED; + } + + DestoryResource(); + + return SUCCESS; +} diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp b/cust_op/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0bba4a1b3d14066fd25d2aec906689015533b99e --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp @@ -0,0 +1,466 @@ +/** +* @file op_runner.cpp +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/ +#include "op_runner.h" +#include "aclnn_attention_fusion.h" +#include +#include +#include +#include "acl/acl_op_compiler.h" +#include "common.h" + +using namespace std; + +extern bool g_isDevice; + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); +} + +OpRunner::~OpRunner() +{ + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), + nullptr, 0, GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), + nullptr, 0, GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + return 
numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + auto ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], + opDesc_->maskOnOptional, outputTensor_[0], outputTensor_[1], + &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAttentionFusionGetWorkspaceSize success, workspace size %lu", workspaceSize); + + void *workspace = nullptr; + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = aclnnAttentionFusion(workspace, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAttentionFusion success"); + + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. 
error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + auto beforeTime = std::chrono::steady_clock::now(); + for (int i = 0; i<100; i++) { + ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], + opDesc_->maskOnOptional, outputTensor_[0], outputTensor_[1], + &workspaceSize, &handle); + ret = aclnnAttentionFusion(workspace, workspaceSize, handle, stream); + } + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + auto afterTime = std::chrono::steady_clock::now(); + double duration_microsecond = std::chrono::duration(afterTime - beforeTime).count(); + std::cout << "time cost " << duration_microsecond/100 << " us" << std::endl; + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return true; +} + + +template +void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} + +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} diff --git a/cust_op/attention_fusion/aclnn_attention_fusion/src/operator_desc.cpp b/cust_op/attention_fusion/aclnn_attention_fusion/src/operator_desc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a7a1ed3bd9f6b33a7a697c9aa67045112093081a --- /dev/null +++ b/cust_op/attention_fusion/aclnn_attention_fusion/src/operator_desc.cpp @@ -0,0 +1,56 @@ +/** +* @file operator_desc.cpp +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include "common.h" +#include "operator_desc.h" + +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } + +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + + outputDesc.emplace_back(desc); + return *this; +} diff --git a/cust_op/attention_fusion/attention_fusion.json b/cust_op/attention_fusion/attention_fusion.json new file mode 100644 index 0000000000000000000000000000000000000000..130f1c66d9ca5430ac08b16548bea6620ab4cc96 --- /dev/null +++ b/cust_op/attention_fusion/attention_fusion.json @@ -0,0 +1,78 @@ +[ + { + "op": "AttentionFusion", + "language": "cpp", + "input_desc": [ + { + "name": "query", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "key", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "value", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "attn_mask", + "param_type": "optional", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "output_desc": [ + { + "name": "atten_score", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "softmax_out", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "attr": [ + { + "name": "mask_on", + "param_type": "optional", + "type": "int", + "default_value": 0 + } + ] + } +] \ No newline at end of file diff --git a/cust_op/attention_fusion/op_host/CMakeLists.txt 
b/cust_op/attention_fusion/op_host/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..40dd51cfac524b0a9607b7d8b2813edd2210c509 --- /dev/null +++ b/cust_op/attention_fusion/op_host/CMakeLists.txt @@ -0,0 +1,82 @@ + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) + +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +add_library(cust_op_proto SHARED ${ops_srcs} ${ASCEND_AUTOGEN_PATH}/op_proto.cc) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +add_library(cust_optiling SHARED ${ops_srcs}) +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) +add_library(cust_opapi SHARED ${aclnn_src}) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) + +install(TARGETS cust_op_proto + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) +install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) +install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) +install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) diff --git a/cust_op/attention_fusion/op_host/attention_fusion.cpp b/cust_op/attention_fusion/op_host/attention_fusion.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4f2b8823faacd4f54bdb6ce5bc756548dd8c9b06 --- /dev/null +++ b/cust_op/attention_fusion/op_host/attention_fusion.cpp @@ -0,0 +1,275 @@ +#include +#include +#include "attention_fusion_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +#define TEST_LOG(fmt, args...) 
fprintf(stdout, fmt "\n", ##args) + +namespace optiling { +#define ge::GRAPH_FAILED 1 +#define ge::GRAPH_SUCCESS 0 +#define ALIGN_32 (32 / sizeof(float)) +#define RESERVER_UB_SIZE (20 * 1024) +#define ALREADY_ALIGNED 1 +#define SPECIAL_CASE 2 +#define SPECIAL_ROW_SIZE (16 * 16) +#define SPECIAL_Q_DIM1 500 +#define SPECIAL_K_DIM1 50 +#define ORIG_UNPAD_DIM0 16 +#define ORIG_UNPAD_DIM1 (16 * 50) +#define TRANSPOSED_PADDED_DIM0 (56 * 16) +#define TRANSPOSED_PADDED_DIM1 16 +#define ORIG_PADDED_DIM0 16 +#define ORIG_PADDED_DIM1 (56 * 16) +#define TRANSPOSED_UNPAD_DIM0 (16 * 50) +#define TRANSPOSED_UNPAD_DIM1 16 +#define UB_TILES 3 +#define DIM0 0 +#define DIM1 1 +#define DIM2 2 + +static int32_t MatmulTiling(gert::TilingContext* context, AttentionFusionTilingData &tilingData) +{ + // q (B, M, K) k (B, N, K) v (B, V, K) + auto qShape = context->GetInputShape(0)->GetStorageShape(); + auto kShape = context->GetInputShape(1)->GetStorageShape(); + auto vShape = context->GetInputShape(2)->GetStorageShape(); + + auto ascnedPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + size_t coreNum = ascnedPlatform.GetCoreNumAic(); + // qkMatmul configuration + matmul_tiling::MultiCoreMatmulTiling qkMm(ascnedPlatform); + qkMm.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + qkMm.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + qkMm.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + qkMm.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + qkMm.SetShape(qShape.GetDim(DIM1), kShape.GetDim(DIM1), kShape.GetDim(DIM2)); + + qkMm.SetSingleShape(qShape.GetDim(DIM1), kShape.GetDim(DIM1), kShape.GetDim(DIM2)); + + qkMm.SetBias(false); + qkMm.SetBufferSpace(-1, -1, -1); + qkMm.SetDim(coreNum); + + // kvBmm Matmul Tilling + matmul_tiling::MultiCoreMatmulTiling kvMm(ascnedPlatform); + kvMm.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + kvMm.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + kvMm.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + kvMm.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + kvMm.SetShape(qShape.GetDim(DIM1), vShape.GetDim(DIM2), kShape.GetDim(DIM1)); + kvMm.SetSingleShape(qShape.GetDim(DIM1), vShape.GetDim(DIM2), kShape.GetDim(DIM1)); + + kvMm.SetBias(false); + kvMm.SetBufferSpace(-1, -1, -1); + kvMm.SetDim(coreNum); + + // Get tilingData using on the kernel side + if (qkMm.GetTiling(tilingData.qkMatmulTiling) == -1 || + kvMm.GetTiling(tilingData.kvMatmulTiling) == -1) { + return ge::GRAPH_FAILED; + } + return ge::GRAPH_SUCCESS; +} + +static int32_t SoftmaxTiling(gert::TilingContext* context, AttentionFusionTilingData &tilingData, uint64_t ub) +{ + auto qShape = context->GetInputShape(0)->GetStorageShape(); + auto kShape = context->GetInputShape(1)->GetStorageShape(); + auto vShape = context->GetInputShape(2)->GetStorageShape(); + const int32_t* maskIsOn = context->GetAttrs()->GetAttrPointer(0); + + int numOfelement = kShape.GetDim(DIM1) / ALIGN_32; + uint8_t attr = 0; + int normalizeColumn = 0; + + // Get column with 32bytes alignment + if ((kShape.GetDim(DIM1) % ALIGN_32) == 0) { + normalizeColumn = numOfelement 
* ALIGN_32; + attr = ALREADY_ALIGNED; + } else if (kShape.GetDim(DIM1) == SPECIAL_K_DIM1 && qShape.GetDim(DIM1) > SPECIAL_Q_DIM1) { + normalizeColumn = numOfelement * ALIGN_32 + ALIGN_32; + attr = SPECIAL_CASE; + } else { + normalizeColumn = numOfelement * ALIGN_32 + ALIGN_32; + } + + if ((sizeof(float) * normalizeColumn) > (ub / UB_TILES)) { + return ge::GRAPH_FAILED; + } + + // Get how many rows that half of ub contains + int normalizeRow = ub / UB_TILES / (sizeof(float) * normalizeColumn); + if (normalizeRow > qShape.GetDim(DIM1)) { + normalizeRow = qShape.GetDim(DIM1); + } + + if (attr == SPECIAL_CASE) { + normalizeRow = SPECIAL_ROW_SIZE; + } + + // Get max Ub left for softmax shared tmp buffer + uint64_t maxLocalWorkSize = ub - (normalizeRow * normalizeColumn * sizeof(float) * 2); + + const ge::Shape softmaxShape({normalizeRow, normalizeColumn}); + const uint32_t minLocalWorkSize = AscendC::GetSoftMaxMinTmpSize(softmaxShape, sizeof(float), false); + if (minLocalWorkSize > maxLocalWorkSize) { + return ge::GRAPH_FAILED; + } + + // divisor should not be 0 + if (normalizeRow == 0) { + return ge::GRAPH_FAILED; + } + int normalizeLoop = qShape.GetDim(DIM1) / normalizeRow; + normalizeLoop = ((qShape.GetDim(DIM1) % normalizeRow) == 0) ? normalizeLoop : normalizeLoop + 1; + float res = sqrt(qShape.GetDim(DIM2)); + float dimSqrt = (res != 0) ? (1 / res) : 0; + + // set tiling data + tilingData.set_normalizeAttr(attr); + tilingData.set_attnDim(qShape.GetDim(DIM2)); + tilingData.set_queryDim1(qShape.GetDim(DIM1)); + tilingData.set_queryDim2(qShape.GetDim(DIM2)); + tilingData.set_keyDim1(kShape.GetDim(DIM1)); + tilingData.set_valueDim2(vShape.GetDim(DIM2)); + tilingData.set_batchNum(qShape.GetDim(DIM0)); + tilingData.set_normalizeLoop(normalizeLoop); + tilingData.set_normalizeRow(normalizeRow); + tilingData.set_normalizeColumn(normalizeColumn); + tilingData.set_maskIsOn(*maskIsOn); + tilingData.set_normalizeSqrt(dimSqrt); + tilingData.set_maxSharedTmpBuf(maxLocalWorkSize); + + AscendC::SoftMaxTilingFunc(softmaxShape, sizeof(float), maxLocalWorkSize, tilingData.softMaxTilingData); + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus TilingFunc(gert::TilingContext* context) +{ + AttentionFusionTilingData tilingData; + + // Platform configuration + auto ascnedPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + size_t systemWorkspacesSize = ascnedPlatform.GetLibApiWorkSpaceSize(); + currentWorkspace[0] = 0 + systemWorkspacesSize; + size_t coreNum = ascnedPlatform.GetCoreNumAic(); + + uint64_t ub; + ascnedPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub); + ub = ub - RESERVER_UB_SIZE; + + if (MatmulTiling(context, tilingData) != ge::GRAPH_SUCCESS || + SoftmaxTiling(context, tilingData, ub) != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + + // Get tiling data for transposing the origin tensor, then the transposed tensor is going to be padded. + std::vector shapeVec = {ORIG_UNPAD_DIM0, ORIG_UNPAD_DIM1}; + ge::Shape srcShape(shapeVec); + AscendC::GetConfusionTransposeTilingInfo(srcShape, 0, sizeof(float), 7, tilingData.confusionTransposeTilingData); + + // Get tiling data for transposing the padded tensor. 
+ std::vector shapeVec1 = {TRANSPOSED_PADDED_DIM0, TRANSPOSED_PADDED_DIM1}; + ge::Shape srcShape1(shapeVec1); + AscendC::GetConfusionTransposeTilingInfo(srcShape1, 0, sizeof(float), 7, tilingData.confusionTransposeTilingData1); + + // Get tiling data for transposing the padded tensor, then the transposed tensor is going to be unpadded. + std::vector shapeVec2 = {ORIG_PADDED_DIM0, ORIG_PADDED_DIM1}; + ge::Shape srcShape2(shapeVec2); + AscendC::GetConfusionTransposeTilingInfo(srcShape2, 0, sizeof(float), 7, tilingData.confusionTransposeTilingData2); + + // Get tiling data for transposing the unpadded tensor back to original shape. + std::vector shapeVec3 = {TRANSPOSED_UNPAD_DIM0, TRANSPOSED_UNPAD_DIM1}; + ge::Shape srcShape3(shapeVec3); + AscendC::GetConfusionTransposeTilingInfo(srcShape3, 0, sizeof(float), 7, tilingData.confusionTransposeTilingData3); + + context->SetBlockDim(coreNum); + tilingData.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tilingData.GetDataSize()); + return ge::GRAPH_SUCCESS; +} +} + + +namespace ge { +static ge::graphStatus InferShape(gert::InferShapeContext* context) +{ + const gert::Shape* qShape = context->GetInputShape(0); + const gert::Shape* kShape = context->GetInputShape(1); + const gert::Shape* vShape = context->GetInputShape(2); + + gert::Shape* attnScoreShape = context->GetOutputShape(0); + gert::Shape* softmaxOutShape = context->GetOutputShape(1); + + attnScoreShape->SetDimNum(3); + attnScoreShape->SetDim(0, qShape->GetDim(DIM0)); + attnScoreShape->SetDim(1, qShape->GetDim(DIM1)); + attnScoreShape->SetDim(2, vShape->GetDim(DIM2)); + + softmaxOutShape->SetDimNum(3); + softmaxOutShape->SetDim(0, qShape->GetDim(DIM0)); + softmaxOutShape->SetDim(1, qShape->GetDim(DIM1)); + softmaxOutShape->SetDim(2, kShape->GetDim(DIM1)); + + return GRAPH_SUCCESS; +} +static ge::graphStatus InferDtype(gert::InferDataTypeContext* context) +{ + context->SetOutputDataType(0, context->GetInputDataType(0)); + context->SetOutputDataType(1, context->GetInputDataType(1)); + return GRAPH_SUCCESS; +} +} + + +namespace ops { +class AttentionFusion : public OpDef { +public: + explicit AttentionFusion(const char* name) : OpDef(name) + { + this->Input("query") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("key") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("value") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("attn_mask") + .ParamType(OPTIONAL) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("atten_score") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("softmax_out") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Attr("mask_on").Int(); + + this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDtype); + + this->AICore() + .SetTiling(optiling::TilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910"); + } +}; + +OP_ADD(AttentionFusion); +} diff --git a/cust_op/attention_fusion/op_host/attention_fusion_tiling.h b/cust_op/attention_fusion/op_host/attention_fusion_tiling.h new 
file mode 100644
index 0000000000000000000000000000000000000000..3a6695e67b4fd172609728ab9b7c36e58c9c0cfb
--- /dev/null
+++ b/cust_op/attention_fusion/op_host/attention_fusion_tiling.h
@@ -0,0 +1,28 @@
+#include "register/tilingdata_base.h"
+#include "tiling/tiling_api.h"
+namespace optiling {
+    BEGIN_TILING_DATA_DEF(AttentionFusionTilingData)
+    TILING_DATA_FIELD_DEF(uint8_t, normalizeAttr);
+    TILING_DATA_FIELD_DEF(float, attnDim);
+    TILING_DATA_FIELD_DEF(int32_t, queryDim1);
+    TILING_DATA_FIELD_DEF(int32_t, queryDim2);
+    TILING_DATA_FIELD_DEF(int32_t, keyDim1);
+    TILING_DATA_FIELD_DEF(int32_t, valueDim2);
+    TILING_DATA_FIELD_DEF(int32_t, batchNum);
+    TILING_DATA_FIELD_DEF(int32_t, normalizeLoop);
+    TILING_DATA_FIELD_DEF(int32_t, normalizeRow);
+    TILING_DATA_FIELD_DEF(int32_t, normalizeColumn);
+    TILING_DATA_FIELD_DEF(int32_t, maskIsOn);
+    TILING_DATA_FIELD_DEF(float, normalizeSqrt);
+    TILING_DATA_FIELD_DEF(uint64_t, maxSharedTmpBuf);
+    TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, qkMatmulTiling);
+    TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, kvMatmulTiling);
+    TILING_DATA_FIELD_DEF_STRUCT(SoftMaxTiling, softMaxTilingData);
+    TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData);
+    TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData1);
+    TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData2);
+    TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData3);
+    END_TILING_DATA_DEF;
+
+    REGISTER_TILING_DATA_CLASS(AttentionFusion, AttentionFusionTilingData)
+}
diff --git a/cust_op/attention_fusion/op_kernel/CMakeLists.txt b/cust_op/attention_fusion/op_kernel/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2f0e48294cfb99987502b97d46ef9ff85bc7ed7f
--- /dev/null
+++ b/cust_op/attention_fusion/op_kernel/CMakeLists.txt
@@ -0,0 +1,60 @@
+# set custom compile options
+if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
+    add_ops_compile_options(ALL OPTIONS -g -O0)
+endif()
+
+foreach(compute_unit ${ASCEND_COMPUTE_UNIT})
+
+    # generate aic-${compute_unit}-ops-info.json
+    add_ops_info_target(TARGET ops_info_gen_${compute_unit}
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json
+        OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+        INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}
+    )
+
+    # generate ascendc impl py once
+    if (NOT TARGET ascendc_impl_gen)
+        add_ops_impl_target(TARGET ascendc_impl_gen
+            OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+            IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR}
+            OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe
+            INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl
+        )
+    endif()
+
+    # dynamic shape binary compile
+    if (${ENABLE_BINARY_PACKAGE})
+        add_bin_compile_target(TARGET ascendc_bin_${compute_unit}
+            OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+            IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR}
+            ADP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/dynamic
+            OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}
+            INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
+            COMPUTE_UNIT ${compute_unit}
+        )
+        add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen)
+    endif()
+endforeach()
+
+# generate npu_supported_ops.json
+add_npu_support_target(TARGET npu_supported_ops
+    OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH}
+    OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core
+ INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} +) + +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# install kernel file +if (${ENABLE_SOURCE_PACKAGE}) + file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.py + ) + install(FILES ${KERNEL_FILES} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) +endif() diff --git a/cust_op/attention_fusion/op_kernel/attention_fusion.cpp b/cust_op/attention_fusion/op_kernel/attention_fusion.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fe2c88db200763a0d71b3e28851586e901ca3af5 --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/attention_fusion.cpp @@ -0,0 +1,24 @@ +#include "kernel_operator.h" +#include "attention_fusion_kernel.h" +using namespace AscendC; + +// call of kernel function +extern "C" __global__ __aicore__ void attention_fusion(GM_ADDR query, GM_ADDR key, GM_ADDR value, GM_ADDR attnMask, GM_ADDR attenScore, GM_ADDR softmaxOut, GM_ADDR workspace, GM_ADDR tiling) { + GET_TILING_DATA(tiling_data, tiling); + + const TCubeTiling *qkMatmulTiling = &tiling_data.qkMatmulTiling; + const TCubeTiling *kvMatmulTiling = &tiling_data.kvMatmulTiling; + const SoftMaxTiling *softMaxTilingData = &tiling_data.softMaxTilingData; + + AttentionFusionArgs args { + query, key, value, attnMask, attenScore, softmaxOut, tiling_data.normalizeAttr, tiling_data.queryDim1, + tiling_data.queryDim2, tiling_data.keyDim1, tiling_data.valueDim2, tiling_data.batchNum, + tiling_data.normalizeLoop, tiling_data.normalizeRow, tiling_data.normalizeColumn, tiling_data.maskIsOn, + tiling_data.normalizeSqrt, tiling_data.maxSharedTmpBuf, qkMatmulTiling, kvMatmulTiling, + softMaxTilingData, &tiling_data.confusionTransposeTilingData, &tiling_data.confusionTransposeTilingData1, + &tiling_data.confusionTransposeTilingData2, &tiling_data.confusionTransposeTilingData3 + }; + + AttentionFusionKernel kernel; + kernel.Compute(args); +} \ No newline at end of file diff --git a/cust_op/attention_fusion/op_kernel/attention_fusion_kernel.h b/cust_op/attention_fusion/op_kernel/attention_fusion_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..63788fef9e2bf558dc9f034f3888cfa7a6a23f78 --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/attention_fusion_kernel.h @@ -0,0 +1,161 @@ +#ifndef ATTENTION_FUSION_KERNEL_H +#define ATTENTION_FUSION_KERNEL_H + +#include +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "q_k_bmm_compute.h" +#include "k_v_bmm_compute.h" +#include "normalize_compute.h" + +using namespace AscendC; + +struct AttentionFusionArgs { + GM_ADDR query; + GM_ADDR key; + GM_ADDR value; + GM_ADDR attnMask; + GM_ADDR attenScore; + GM_ADDR softmaxOut; + + uint8_t normalizeAttr; + int32_t queryDim1; + int32_t queryDim2; + int32_t keyDim1; + int32_t valueDim2; + int32_t batchNum; + int32_t normalizeLoop; + int32_t normalizeRow; + int32_t normalizeColumn; + int32_t maskIsOn; + float normalizeSqrt; + uint64_t maxSharedTmpBuf; + + const TCubeTiling* qkMatmulTiling; + const TCubeTiling* kvMatmulTiling; + const SoftMaxTiling* softMaxTilingData; + + const ConfusionTransposeTiling* confusionTransposeTilingData; + const ConfusionTransposeTiling* confusionTransposeTilingData1; + const ConfusionTransposeTiling* confusionTransposeTilingData2; + const ConfusionTransposeTiling* confusionTransposeTilingData3; +}; + +struct 
AttentionFusionPipe { + TPipe* pipe; +}; + +template +__aicore__ inline T1 CeilDiv(T1 a, T2 b) { + if (b == 0) { + return 0; + } + return (a + b -1) / b; +} + +template +class AttentionFusionKernel { + public: + __aicore__ inline AttentionFusionKernel() {}; + + __aicore__ inline void Compute(AttentionFusionArgs args) + { + // Args + this->args = args; + + // Matmul Register + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), qKBmmCompute.mm, args.qkMatmulTiling, kvBmmCompute.mm, + args.kvMatmulTiling); + + // batch offset + GetBatchOffsetAndLen(args.batchNum, this->batchOffset, this->batchLen); + + // QKBmm Initialize + QKBmmArgs qKBmmArgs { + args.query, args.key, args.softmaxOut, + args.queryDim1, args.keyDim1, args.queryDim2, + batchOffset, batchLen + }; + QKBmmPipeArgs qKBmmPipeArgs {&pipe}; + qKBmmCompute.Init(qKBmmArgs, qKBmmPipeArgs); + + NormalizeArgs normalArgs { + &pipe, args.normalizeAttr, args.queryDim1, args.keyDim1, args.normalizeLoop, args.normalizeRow, + args.normalizeColumn, args.maskIsOn, args.normalizeSqrt, args.maxSharedTmpBuf, args.softMaxTilingData, + args.confusionTransposeTilingData, args.confusionTransposeTilingData1, + args.confusionTransposeTilingData2, args.confusionTransposeTilingData3 + }; + normalizeCompute.Init(normalArgs); + + // KVBmm Initialize + KVBmmArgs kvBmmArgs { + args.softmaxOut, args.value, args.attenScore, + args.queryDim1, args.valueDim2, args.keyDim1, + batchOffset + }; + KVBmmPipeArgs kvBmmPipeArgs {&pipe}; + kvBmmCompute.Init(kvBmmArgs, kvBmmPipeArgs); + + // Start compute + Process(); + } + + __aicore__ inline void Process() + { + QKBmmComputePart(); + NormalizeMatmulFusion(); + } + private: + __aicore__ inline void QKBmmComputePart() + { + qKBmmCompute.Process(); + } + + __aicore__ inline void NormalizeMatmulFusion() + { + GlobalTensor softmaxOutGbTensorThisCore; + softmaxOutGbTensorThisCore.SetGlobalBuffer(reinterpret_cast<__gm__ qType*>(args.softmaxOut), + batchLen * args.queryDim1 * args.keyDim1); + GlobalTensor softmaxGbMaskThisCore; + softmaxGbMaskThisCore.SetGlobalBuffer(reinterpret_cast<__gm__ qType*>(args.attnMask), + batchLen * args.queryDim1 * args.keyDim1); + for(int i = 0; i < batchLen + 1; i++) { + if (i != batchLen) { + GlobalTensor softmaxOutGbTensor = + softmaxOutGbTensorThisCore[(batchOffset + i) * args.queryDim1 * args.keyDim1]; + GlobalTensor softmaxGbMaskTensor = + softmaxGbMaskThisCore[(batchOffset + i) * args.queryDim1 * args.keyDim1]; + /* normallize */ + normalizeCompute.Process(softmaxOutGbTensor, softmaxGbMaskTensor); + } + + if (i != 0) { + /* matmul */ + kvBmmCompute.ComputeOneBatch(i - 1); + } + } + } + + __aicore__ inline void GetBatchOffsetAndLen(int batchNum, int& batchOffset, int& batchLen) + { + // batch offset + int blockLenPerCore = CeilDiv(batchNum, (GetBlockNum() * 2)); + batchOffset = blockLenPerCore*GetBlockIdx(); + batchLen = blockLenPerCore; + if (batchOffset + batchLen > batchNum) { + batchLen = batchNum - batchOffset; + } + } + + private: + TPipe pipe; + AttentionFusionArgs args; + QKBmmCompute qKBmmCompute; + KVBmmCompute kvBmmCompute; + NormalizeCompute normalizeCompute; + + int batchOffset; + int batchLen; +}; + +#endif \ No newline at end of file diff --git a/cust_op/attention_fusion/op_kernel/k_v_bmm_compute.h b/cust_op/attention_fusion/op_kernel/k_v_bmm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..16a7ca7f92b2e01201042721df83915071e4fac8 --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/k_v_bmm_compute.h @@ -0,0 +1,69 @@ +#ifndef KV_BMMM_COMPUTE__H 
+#define KV_BMMM_COMPUTE__H + +#include "kernel_operator.h" +using namespace AscendC; + + +struct KVBmmArgs { + GM_ADDR softmaxOut; + GM_ADDR value; + GM_ADDR out; + + int M; + int N; + int K; + + int batchOffset; +}; + +struct KVBmmPipeArgs { + TPipe* pipe; +}; + +template +class KVBmmCompute { +public: + __aicore__ inline KVBmmCompute(){} + + __aicore__ inline void Init(KVBmmArgs kvBmmArgs, KVBmmPipeArgs pipeArgs) + { + this->kvBmmArgs = kvBmmArgs; + this->pipeArgs = pipeArgs; + + // kernel batch offset + sGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ sType*>(kvBmmArgs.softmaxOut), kvBmmArgs.M * kvBmmArgs.K); + sGlobal = sGlobal[kvBmmArgs.batchOffset * kvBmmArgs.M * kvBmmArgs.K]; + vGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ vType*>(kvBmmArgs.value), kvBmmArgs.N * kvBmmArgs.K); + vGlobal = vGlobal[kvBmmArgs.batchOffset * kvBmmArgs.N * kvBmmArgs.K]; + outGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ vType*>(kvBmmArgs.out), kvBmmArgs.M * kvBmmArgs.N); + outGlobal = outGlobal[kvBmmArgs.batchOffset * kvBmmArgs.M * kvBmmArgs.N]; + } + + __aicore__ inline void ComputeOneBatch(int batchI) + { + if (batchI != 0) { + mm.WaitIterateAll(); + mm.End(); + } + + mm.SetTensorA(sGlobal[batchI * kvBmmArgs.M * kvBmmArgs.K]); + mm.SetTensorB(vGlobal[batchI * kvBmmArgs.N * kvBmmArgs.K]); + + mm.template IterateAll(outGlobal[batchI * kvBmmArgs.M * kvBmmArgs.N], 0, false, true); + } + + matmul::Matmul< + matmul::MatmulType, + matmul::MatmulType, + matmul::MatmulType, + matmul::MatmulType + > mm; +private: + KVBmmArgs kvBmmArgs; + KVBmmPipeArgs pipeArgs; + GlobalTensor sGlobal; + GlobalTensor vGlobal; + GlobalTensor outGlobal; +}; +#endif \ No newline at end of file diff --git a/cust_op/attention_fusion/op_kernel/normalize_compute.h b/cust_op/attention_fusion/op_kernel/normalize_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..94f44a1c9f69c3eb0c192d8eb9b3756106985bed --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/normalize_compute.h @@ -0,0 +1,240 @@ +#ifndef NORMALIZE_COMPUTE__H +#define NORMALIZE_COMPUTE__H +#include +#include "kernel_operator.h" +using namespace AscendC; +#define ALIGN_32 32 +#define ALREADY_ALIGNED 1 +#define SPECIAL_CASE 2 +#define PAD_SIZE (16 * 56 * 16) +#define SPECIAL_BLOCK_COUNT 16 +#define SPECIAL_BLOCK_LEN (16 * 56 * 16) +#define SPECIAL_STRIDE (6 * 16 / 8) +#define PAD_VALUE -1000 + +struct NormalizeArgs { + TPipe* pipe; + + uint8_t attr; + int queryDim1; + int keyDim1; + int loopCount; + int normalizeRow; + int normalizeColumn; + int maskIsOn; + float normalizeSqrt; + uint64_t maxSharedTmpBuf; + + const SoftMaxTiling* tiling; + + const ConfusionTransposeTiling* confusionTransposeTilingData; + const ConfusionTransposeTiling* confusionTransposeTilingData1; + const ConfusionTransposeTiling* confusionTransposeTilingData2; + const ConfusionTransposeTiling* confusionTransposeTilingData3; +}; + +template +class NormalizeCompute { +public: + __aicore__ inline NormalizeCompute(){} + + __aicore__ inline void Init(NormalizeArgs normalArgs) + { + this->args = normalArgs; + int bufSize = args.normalizeRow * args.normalizeColumn * sizeof(qType); + args.pipe->InitBuffer(vecInQueue, 1, bufSize); + args.pipe->InitBuffer(vecOutQueue, 1, bufSize); + args.pipe->InitBuffer(vecSharedQueue, 1, args.maxSharedTmpBuf); + } + + __aicore__ inline void DoPadLocal(LocalTensor& sourceTensor, LocalTensor& mindTensor, + const ConfusionTransposeTiling* confusionTransposeTilingData, + const ConfusionTransposeTiling*
confusionTransposeTilingData1) + { + ConfusionTransposeTiling tiling = *confusionTransposeTilingData; + ConfusionTranspose(mindTensor, sourceTensor, TransposeType::TRANSPOSE_ND2ND_ONLY, tiling); + Duplicate(sourceTensor, PAD_VALUE, PAD_SIZE); + DataCopyParams dataCopyParam = {0, 0, 0, 0}; + dataCopyParam.blockCount = SPECIAL_BLOCK_COUNT; + dataCopyParam.blockLen = SPECIAL_BLOCK_LEN; + dataCopyParam.srcStride = 0; + dataCopyParam.dstStride = SPECIAL_STRIDE; + DataCopy(sourceTensor, mindTensor, dataCopyParam); + + ConfusionTransposeTiling tiling1 = *confusionTransposeTilingData1; + ConfusionTranspose(mindTensor, sourceTensor, TransposeType::TRANSPOSE_ND2ND_ONLY, tiling1); + DataCopy(sourceTensor, mindTensor, PAD_SIZE); + } + + + __aicore__ inline void DoUnPadLocal(LocalTensor& sourceTensor, LocalTensor& mindTensor, + const ConfusionTransposeTiling* confusionTransposeTilingData2, + const ConfusionTransposeTiling* confusionTransposeTilingData3) + { + ConfusionTransposeTiling tiling = *confusionTransposeTilingData2; + ConfusionTranspose(mindTensor, sourceTensor, TransposeType::TRANSPOSE_ND2ND_ONLY, tiling); + Duplicate(sourceTensor, PAD_VALUE, PAD_SIZE); + DataCopyParams dataCopyParam = {0, 0, 0, 0}; + dataCopyParam.blockCount = SPECIAL_BLOCK_COUNT; + dataCopyParam.blockLen = SPECIAL_BLOCK_LEN; + dataCopyParam.srcStride = SPECIAL_STRIDE; + dataCopyParam.dstStride = 0; + DataCopy(sourceTensor, mindTensor, dataCopyParam); + + ConfusionTransposeTiling tiling1 = *confusionTransposeTilingData3; + ConfusionTranspose(mindTensor, sourceTensor, TransposeType::TRANSPOSE_ND2ND_ONLY, tiling1); + DataCopy(sourceTensor, mindTensor, PAD_SIZE); + } + + __aicore__ inline void Process(GlobalTensor softmaxGlobleTensor, GlobalTensor softmaxGbMask) + { + srcGloblePtr = softmaxGlobleTensor; + maskGloblePtr = softmaxGbMask; + offset = 0; + usedRowCount = 0; + uint8_t padLen = args.normalizeColumn - args.keyDim1; + padParams = {false, 0, padLen, 0}; + + for (int i = 0; i < args.loopCount; i++) { + /* Get height of softmax matrix and handle the last loop height */ + height = ((args.queryDim1 - usedRowCount) < args.normalizeRow) ? 
+ args.queryDim1 - usedRowCount : args.normalizeRow; + totalSize = height * args.normalizeColumn; + + CopyIn(); + PreCompute(); + Compute(); + CopyOut(); + + usedRowCount += height; + offset += args.normalizeRow * args.keyDim1; + } + } + +private: + __aicore__ inline void CopyMask() + { + LocalTensor LocalMask = vecSharedQueue.AllocTensor(); + if (args.attr == ALREADY_ALIGNED) { + DataCopy(LocalMask, maskGloblePtr[offset], totalSize); + } else if (args.attr == SPECIAL_CASE) { + DataCopy(LocalMask, maskGloblePtr[offset], totalSize); + } else { + DataCopyPad(LocalMask, maskGloblePtr[offset], copyParams, padParams); + } + + vecSharedQueue.EnQue(LocalMask); + } + + __aicore__ inline void CopyIn() + { + LocalTensor inLocalTensor = vecInQueue.AllocTensor(); + LocalTensor outLocalTensor = vecOutQueue.AllocTensor(); + + if (args.attr == ALREADY_ALIGNED) { + DataCopy(inLocalTensor, srcGloblePtr[offset], totalSize); + } else if (args.attr == SPECIAL_CASE) { + DataCopy(inLocalTensor, srcGloblePtr[offset], totalSize); + } else { + copyParams.blockCount = height; + copyParams.blockLen = args.keyDim1 * sizeof(qType); + DataCopyPad(inLocalTensor, srcGloblePtr[offset], copyParams, padParams); + } + + if (args.maskIsOn == 1) { + CopyMask(); + } + + vecInQueue.EnQue(inLocalTensor); + vecOutQueue.EnQue(outLocalTensor); + } + + __aicore__ inline void PreCompute() + { + LocalTensor inLocalTensor = vecInQueue.DeQue(); + LocalTensor outLocalTensor = vecOutQueue.DeQue(); + + if (args.attr == SPECIAL_CASE && args.maskIsOn == 1) { + LocalTensor LocalMask = vecSharedQueue.DeQue(); + DoPadLocal(LocalMask, outLocalTensor, args.confusionTransposeTilingData, + args.confusionTransposeTilingData1); + DoPadLocal(inLocalTensor, outLocalTensor, args.confusionTransposeTilingData, + args.confusionTransposeTilingData1); + vecSharedQueue.EnQue(LocalMask); + } else if (args.attr == SPECIAL_CASE) { + DoPadLocal(inLocalTensor, outLocalTensor, args.confusionTransposeTilingData, + args.confusionTransposeTilingData1); + } + + // atten_weight = qkMatMul / sqrt(atten_dim) + Muls(inLocalTensor, inLocalTensor, args.normalizeSqrt, totalSize); + + if (args.maskIsOn == 1) { + LocalTensor LocalMask = vecSharedQueue.DeQue(); + // atten_mask = (1 - mask) * 10000 + Muls(LocalMask, LocalMask, (float)-10000, totalSize); + Adds(LocalMask, LocalMask, (float)10000, totalSize); + + // atten_weight = atten_weight + atten_mask + Add(inLocalTensor, inLocalTensor, LocalMask, totalSize); + vecSharedQueue.FreeTensor(LocalMask); + } + + vecInQueue.EnQue(inLocalTensor); + vecOutQueue.EnQue(outLocalTensor); + } + + __aicore__ inline void Compute() + { + LocalTensor inLocalTensor = vecInQueue.DeQue(); + LocalTensor outLocalTensor = vecOutQueue.DeQue(); + LocalTensor sharedTmpBuf = vecSharedQueue.AllocTensor(); + + SoftMaxShapeInfo scrShape ={height, (uint32_t)args.normalizeColumn, height, (uint32_t)args.keyDim1}; + SoftMax(outLocalTensor, inLocalTensor, sharedTmpBuf, *args.tiling, scrShape); + + if (args.attr == SPECIAL_CASE) { + DoUnPadLocal(outLocalTensor, inLocalTensor, args.confusionTransposeTilingData2, + args.confusionTransposeTilingData3); + } + vecOutQueue.EnQue(outLocalTensor); + vecInQueue.FreeTensor(inLocalTensor); + vecSharedQueue.FreeTensor(sharedTmpBuf); + } + + __aicore__ inline void CopyOut() + { + LocalTensor outLocalTensor = vecOutQueue.DeQue(); + + if (args.attr == ALREADY_ALIGNED) { + DataCopy(srcGloblePtr[offset], outLocalTensor, totalSize); + } else if (args.attr == SPECIAL_CASE) { + uint32_t thisLen = height * args.keyDim1 * 
sizeof(qType); + if ((thisLen % ALIGN_32) != 0) { + DataCopyExtParams dataCopyParamTail {1, thisLen, 0, 0, 0}; + DataCopyPad(srcGloblePtr[offset], outLocalTensor, dataCopyParamTail); + } else { + DataCopy(srcGloblePtr[offset], outLocalTensor, height * args.keyDim1); + } + } else { + DataCopyPad(srcGloblePtr[offset], outLocalTensor, copyParams); + } + vecOutQueue.FreeTensor(outLocalTensor); + } + +private: + NormalizeArgs args; + TQue vecInQueue; + TQue vecOutQueue; + TQue vecSharedQueue; + + GlobalTensor srcGloblePtr; + GlobalTensor maskGloblePtr; + uint32_t height = 0; + int offset = 0; + int usedRowCount = 0; + uint32_t totalSize = 0; + struct DataCopyExtParams copyParams; + struct DataCopyPadExtParams padParams; +}; +#endif \ No newline at end of file diff --git a/cust_op/attention_fusion/op_kernel/q_k_bmm_compute.h b/cust_op/attention_fusion/op_kernel/q_k_bmm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..3fb2d3d14e015138b14a94c7d6faa817dc329e43 --- /dev/null +++ b/cust_op/attention_fusion/op_kernel/q_k_bmm_compute.h @@ -0,0 +1,73 @@ +#ifndef QK_BMMM_COMPUTE__H +#define QK_BMMM_COMPUTE__H +#include +#include "attention_fusion_kernel.h" +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +using namespace AscendC; + +struct QKBmmArgs { + GM_ADDR query; + GM_ADDR key; + GM_ADDR out; + + int dimM; + int dimN; + int dimK; + + int batchOffset; + int batchLen; +}; + +struct QKBmmPipeArgs { + TPipe* pipe; +}; + +template +class QKBmmCompute { +public: + __aicore__ inline QKBmmCompute() {} + + __aicore__ inline void Init(QKBmmArgs qKBmmArgs, QKBmmPipeArgs pipeArgs) + { + this->qKBmmArgs = qKBmmArgs; + + // kernel batch offset + qGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ qType*>(qKBmmArgs.query), + qKBmmArgs.batchLen * qKBmmArgs.dimM * qKBmmArgs.dimK); + qGlobal = qGlobal[qKBmmArgs.batchOffset * qKBmmArgs.dimM * qKBmmArgs.dimK]; + + kGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ kType*>(qKBmmArgs.key), + qKBmmArgs.batchLen * qKBmmArgs.dimN * qKBmmArgs.dimK); + kGlobal = kGlobal[qKBmmArgs.batchOffset * qKBmmArgs.dimN * qKBmmArgs.dimK]; + + outGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ kType*>(qKBmmArgs.out), + qKBmmArgs.batchLen * qKBmmArgs.dimM * qKBmmArgs.dimN); + outGlobal = outGlobal[qKBmmArgs.batchOffset * qKBmmArgs.dimM * qKBmmArgs.dimN]; + } + + __aicore__ inline void Process() + { + for (int thisBatch = 0 ; thisBatch < qKBmmArgs.batchLen; thisBatch++) { + mm.SetTensorA(qGlobal[thisBatch * qKBmmArgs.dimM * qKBmmArgs.dimK]); + mm.SetTensorB(kGlobal[thisBatch * qKBmmArgs.dimN * qKBmmArgs.dimK], true); + + mm.IterateAll(outGlobal[thisBatch * qKBmmArgs.dimM * qKBmmArgs.dimN], 0, false); + } + mm.End(); + } + + matmul::Matmul< + matmul::MatmulType, + matmul::MatmulType , + matmul::MatmulType, + matmul::MatmulType + > mm; +private: + QKBmmArgs qKBmmArgs; + GlobalTensor qGlobal; + GlobalTensor kGlobal; + GlobalTensor outGlobal; +}; +#endif \ No newline at end of file diff --git a/cust_op/attention_fusion/run.sh b/cust_op/attention_fusion/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9f2acd0217cdb96e3b3b08b2ace47a9075674292 --- /dev/null +++ b/cust_op/attention_fusion/run.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e + +# Locate msopgen and add its directory to the PATH environment variable +msopgen_path=$(find /usr/local/Ascend/ -name msopgen | grep bin) +parent_dir=$(dirname "$msopgen_path") +export PATH=$parent_dir:$PATH + +# Use msopgen to generate the buildable operator project +rm -rf ./attention_fusion +/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/bin/msopgen gen -i attention_fusion.json -f tf -c ai_core-Ascend910B1 -lan cpp -out ./attention_fusion -m 0 -op AttentionFusion +rm -rf attention_fusion/op_kernel +rm -rf attention_fusion/host +cp -rf op_kernel attention_fusion/ +cp -rf op_host attention_fusion/ + +cd attention_fusion + +# Check that CMakePresets.json exists in the current directory +if [ ! -f "CMakePresets.json" ]; then + echo "ERROR: CMakePresets.json file does not exist." + exit 1 +fi + +# Disable CRC checksum generation +sed -i 's/--nomd5/--nomd5 --nocrc/g' ./cmake/makeself.cmake + +# Adjust the CANN installation path +sed -i 's:"/usr/local/Ascend/latest":"/usr/local/Ascend/ascend-toolkit/latest":g' CMakePresets.json +# Change vendor_name so that previously installed operators whose vendor_name is customize are not overwritten; +# vendor_name must stay in sync with the CUST_PKG_PATH value in the aclnn CMakeLists.txt, otherwise the aclnn call fails; +# the vendor_name value must not contain customize, or the config.ini file under CANN's vendors path is truncated incorrectly when multiple operators are deployed +sed -i 's:"customize":"attention_fusion":g' CMakePresets.json + +bash build.sh + +# Install the successfully built operator package +bash ./build_out/custom_opp*.run +cd ../aclnn_attention_fusion +bash run.sh +# cd .. +# rm -rf ./attention_fusion \ No newline at end of file
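
Note on the per-core batch split: attention_fusion_kernel.h distributes batches across cores in GetBatchOffsetAndLen, giving each block CeilDiv(batchNum, GetBlockNum() * 2) consecutive batches and clamping the tail block to whatever remains. The standalone host-side sketch below only reproduces that arithmetic; batchNum, blockDim, and the extra zero clamp for blocks that start past the end are illustrative additions, not part of the kernel.

#include <cstdio>

// Mirrors CeilDiv from attention_fusion_kernel.h.
static int CeilDiv(int a, int b) { return (b == 0) ? 0 : (a + b - 1) / b; }

int main() {
    const int batchNum = 24;      // illustrative total batch count
    const int blockDim = 8 * 2;   // stands in for GetBlockNum() * 2
    const int blockLenPerCore = CeilDiv(batchNum, blockDim);
    for (int blockIdx = 0; blockIdx < blockDim; ++blockIdx) {  // stands in for GetBlockIdx()
        int batchOffset = blockLenPerCore * blockIdx;
        int batchLen = blockLenPerCore;
        if (batchOffset + batchLen > batchNum) {
            batchLen = batchNum - batchOffset;  // tail clamp, as in GetBatchOffsetAndLen
        }
        if (batchLen < 0) {
            batchLen = 0;  // demo-only guard for blocks that start past batchNum
        }
        printf("block %2d: batchOffset=%2d batchLen=%d\n", blockIdx, batchOffset, batchLen);
    }
    return 0;
}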
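
Note on the fused loop: NormalizeMatmulFusion in attention_fusion_kernel.h runs batchLen + 1 iterations so the vector-side normalize of batch i overlaps with the cube-side softmax(QK^T) x V matmul of batch i - 1. The sketch below isolates that control flow from the hardware calls; NormalizeBatch and MatmulBatch are placeholders standing in for normalizeCompute.Process and kvBmmCompute.ComputeOneBatch.

#include <cstdio>

// Stand-ins for normalizeCompute.Process and kvBmmCompute.ComputeOneBatch.
static void NormalizeBatch(int i) { std::printf("normalize batch %d\n", i); }
static void MatmulBatch(int i) { std::printf("matmul batch %d\n", i); }

// batchLen + 1 iterations: the first iteration has no finished softmax to feed
// the matmul yet, and the last iteration only drains the final matmul.
static void PipelinedLoop(int batchLen) {
    for (int i = 0; i < batchLen + 1; ++i) {
        if (i != batchLen) {
            NormalizeBatch(i);   // vector side works on batch i
        }
        if (i != 0) {
            MatmulBatch(i - 1);  // cube side consumes batch i - 1
        }
    }
}

int main() {
    PipelinedLoop(3);  // illustrative batch count
    return 0;
}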
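
Note on the normalize math: per its own comments, normalize_compute.h scales the Q*K^T block by normalizeSqrt (assumed here to be the host-precomputed 1/sqrt(attn_dim)), adds (1 - mask) * 10000 when maskIsOn is set, and then applies a row-wise SoftMax. The scalar sketch below mirrors that per-row sequence for checking results against the kernel; the function name and std::vector interface are illustrative, not part of the operator.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for one row of the normalize stage: scale, optional mask, softmax.
std::vector<float> NormalizeRowReference(const std::vector<float>& qkRow,
                                         const std::vector<float>& maskRow,
                                         float normalizeSqrt, bool maskIsOn)
{
    std::vector<float> row(qkRow.size());
    if (row.empty()) {
        return row;
    }
    for (std::size_t i = 0; i < qkRow.size(); ++i) {
        float w = qkRow[i] * normalizeSqrt;        // Muls(in, in, normalizeSqrt, ...)
        if (maskIsOn) {
            w += (1.0f - maskRow[i]) * 10000.0f;   // Muls(mask, -10000) + Adds(10000) + Add
        }
        row[i] = w;
    }
    // Row-wise softmax, the scalar counterpart of the SoftMax API call.
    float maxVal = *std::max_element(row.begin(), row.end());
    float sum = 0.0f;
    for (float& v : row) {
        v = std::exp(v - maxVal);
        sum += v;
    }
    for (float& v : row) {
        v /= sum;
    }
    return row;
}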