From 76989aea8b816c82268c22e99dd53be40ab68f49 Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Tue, 4 Jun 2024 16:50:18 +0800
Subject: [PATCH 01/11] add tf op

---
 attention_fusion/build_ops.sh          | 23 +++++++
 attention_fusion/test/test.py          | 93 ++++++++++++++++++++++++++
 attention_fusion/tf_ops/CMakeLists.txt | 34 ++++++++++
 attention_fusion/tf_ops/atten_ops.cpp  | 81 ++++++++++++++++++++++
 attention_fusion/tf_ops/atten_ops.h    | 19 ++++++
 5 files changed, 250 insertions(+)
 create mode 100644 attention_fusion/build_ops.sh
 create mode 100644 attention_fusion/test/test.py
 create mode 100644 attention_fusion/tf_ops/CMakeLists.txt
 create mode 100644 attention_fusion/tf_ops/atten_ops.cpp
 create mode 100644 attention_fusion/tf_ops/atten_ops.h

diff --git a/attention_fusion/build_ops.sh b/attention_fusion/build_ops.sh
new file mode 100644
index 0000000..d175969
--- /dev/null
+++ b/attention_fusion/build_ops.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+mkdir build
+cd build
+cmake ../tf_ops
+make -j4
+
+cd test
+python3 test.py 
diff --git a/attention_fusion/test/test.py b/attention_fusion/test/test.py
new file mode 100644
index 0000000..e24e49f
--- /dev/null
+++ b/attention_fusion/test/test.py
@@ -0,0 +1,93 @@
+from mpi4py import MPI
+import os
+import numpy as np
+from math import sqrt
+
+import tensorflow as tf
+tf.compat.v1.disable_eager_execution()
+tfOpLib = tf.load_op_library("../build/libattention_ops.so")
+
+import npu_device
+from npu_device.compat.v1.npu_init import *
+
+loss = 1e-3
+minimum = 10e-10
+
+def softmax(src):
+    #基于last轴进行rowmax(按行取最大值)处理
+    max = np.max(src, axis=-1, keepdims=True)
+    sub = src - max
+    exp = np.exp(sub)
+    #基于last轴进行rowsum(按行求和)处理
+    sum = np.sum(exp, axis=-1, keepdims=True)
+    dst = exp / sum
+    return dst
+
+def gloden_atten_fusion(query, key, value, atten_mask):
+    qk = np.matmul(query, key.transpose(0, 2, 1))
+        
+    # @jiangli
+    print("attn_dim: ", query.shape[2])
+    attnDimSqrt = 1 / sqrt(query.shape[2])
+    attnWeight = np.multiply(qk, attnDimSqrt)
+    addMask = np.add(attnWeight, atten_mask)
+    qk_div = softmax(addMask)
+
+    out = np.matmul(qk_div, value)
+    return out, qk_div
+
+def verify_result(real_result, golden):
+    print(real_result[:32])
+    print(golden[:32])
+    result = np.abs(real_result - golden)
+    deno = np.maximum(np.abs(real_result), np.abs(golden))
+    result_atol = np.less_equal(result, loss)
+    result_rtol = np.less_equal(result / np.add(deno, minimum), loss)
+    if not result_rtol.all() and not result_atol.all():
+        if np.sum(result_rtol == False) > real_result.size * loss and np.sum(result_atol == False) > real_result.size * loss:
+            print("[ERROR] result error")
+            return False
+    print("test pass")
+    return True
+
+npu_device.compat.enable_v1()
+npu_init = npu_ops.initialize_system()
+npu_shutdown = npu_ops.shutdown_system()
+config = tf.compat.v1.ConfigProto()
+custom_op = config.graph_options.rewrite_options.custom_optmizers.add()
+custom_op.name = "NpuOptimizer"
+config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
+
+#测试用例
+dim0 = 1024
+dim1 = 1000
+dim2 = 80
+dim3 = 50
+
+query = tf.random_uniform([dim0, dim1, dim2], maxval = 10, dtype=tf.float32)
+key = tf.random_uniform([dim0, dim3, dim2], maxval = 10, dtype=tf.float32)
+value = tf.random_uniform([dim0, dim3, dim2], maxval = 10, dtype=tf.float32)
+mask = tf.random_uniform([dim0, dim1, dim3], maxval = 1, dtype=tf.float32)
+
+golden_atten_score, gold_softmax_out = gloden_atten_fusion(query, key, value, mask)
+
+ret = tfOpLib.attention_fusion(query=query, key=key,
+                                                   value=value, attnMask=mask)
+init = tf.compat.v1.global_varibles_initializer()
+
+with tf.compat.v1.Session(config=config) as sess:
+    sess.run(init)
+    attenScore, softmaxOut = sess.run(ret)
+    print("============ attention fusion =============")
+    print("attenScore: ")
+    verify_result(attenScore, golden_atten_score)
+    print("softmaxOut: ")
+    verify_result(softmaxOut, gold_softmax_out)
+
+    
+
+
+
+
+
diff --git a/attention_fusion/tf_ops/CMakeLists.txt b/attention_fusion/tf_ops/CMakeLists.txt
new file mode 100644
index 0000000..291b68a
--- /dev/null
+++ b/attention_fusion/tf_ops/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+cmake_minimum_required(VERSION 3.20)
+set(CMAKE_CXX_STANDARD 14)
+
+include_directories(/usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/inlcude)
+link_directories(/usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/)
+
+file(GLOB_RECURSE TF_OPS ./*.cpp)
+add_library(attention_ops SHARED ${TF_OPS})
+
+message("TF VERSION" ${TF_VERSION})
+if(${TF_VERSION} EQUAL 1)
+    target_link_libraries(attention_ops /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow_core/libtensorflow_framework.so.1)
+    target_include_directories(attention_ops PUBLIC /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow_core/inlcude)
+else()
+    target_link_libraries(attention_ops /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/libtensorflow_framework.so.2)
+    target_include_directories(attention_ops PUBLIC /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/inlcude)
+endif()
+
+install(TARGET attention_ops LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX})
\ No newline at end of file
diff --git a/attention_fusion/tf_ops/atten_ops.cpp b/attention_fusion/tf_ops/atten_ops.cpp
new file mode 100644
index 0000000..99a9513
--- /dev/null
+++ b/attention_fusion/tf_ops/atten_ops.cpp
@@ -0,0 +1,81 @@
+/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+        limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <atomic>
+#include <map>
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+using namespace tensorflow;
+using shape_inference::inferenceContext;
+using shape_inference::ShapeHandle;
+
+using namespace std;
+using namespace chrono;
+
+using OpKernelConstructionPtr = OpKernelConstructionPtr*;
+using OpKernelContextPtr = OpKernelContextPtr*;
+using inferenceContextPtr = ::tensorflow::shape_inference::inferenceContext*;
+
+namespace {
+    class CustOps : public OpKernel {
+        public:
+            explicit CustOps(OpKernelConstructionPtr context) : OpKernel(context)
+            {
+            }
+
+            void Compute(OpKernelContextPtr context) override
+            {
+                std::cout << "Cust Ops not installed!!" << std::endl;
+            }
+
+            ~CustOps() override = default;
+    }
+}
+
+namespace tensorflow {
+    REGISTER_OP("attention_fusion")
+    .Input("query: float")
+    .Input("key: float")
+    .Input("value: float")
+    .Input("attnMask: float")
+    .Output("attenScore: float")
+    .Output("softmaxOut: float")
+    .SetIsStateful()
+    .SetShapeFn([](::tensorflow::shape_inference::inferenceContext *c)) {
+        ShapeHandle query_shape;
+        ShapeHandle key_shape;
+        ShapeHandle value_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &query_shape));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &key_shape));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &value_shape));
+
+        tensorflow::shape_inference::DimensionHandle queryDim0 = c->Dim(query_shape, 0);
+        tensorflow::shape_inference::DimensionHandle queryDim1 = c->Dim(query_shape, 1);
+        tensorflow::shape_inference::DimensionHandle keyDim1 = c->Dim(key_shape, 1);
+        tensorflow::shape_inference::DimensionHandle valueDim2 = c->Dim(value_shape, 2);
+        int64_t shape0 = c->Value(queryDim0);
+        int64_t shape1 = c->Value(queryDim1);
+        int64_t shape2 = c->Value(keyDim1);
+        int64_t shape3 = c->Value(valueDim2);
+
+        c->set_output(0, c->MakeShape({shape0, shape1, shape3}));
+        c->set_output(1, c->MakeShape({shape0, shape1, shape2}));
+        return Status::OK();
+    }
+    REGISTER_KERNEL_BUILDER(Name("FusedAttention").Device(Device_CPU), CustOps)
+}
\ No newline at end of file
diff --git a/attention_fusion/tf_ops/atten_ops.h b/attention_fusion/tf_ops/atten_ops.h
new file mode 100644
index 0000000..6d022f6
--- /dev/null
+++ b/attention_fusion/tf_ops/atten_ops.h
@@ -0,0 +1,19 @@
+/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+        limitations under the License.
+==============================================================================*/
+
+#ifndef ATTEN_OPS_H
+#define ATTEN_OPS_H
+
+#endif // ATTEN_OPS_H
\ No newline at end of file
-- 
Gitee


From 3c62ba869ecdef81a1404894101eb036d99d7245 Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Wed, 5 Jun 2024 18:05:12 +0800
Subject: [PATCH 02/11] update

---
 attention_fusion/CMakeLists.txt        |  2 ++
 attention_fusion/build_ops.sh          |  8 ++++++--
 attention_fusion/test/test.py          | 19 +++++++++++--------
 attention_fusion/tf_ops/CMakeLists.txt | 20 +++++++++++---------
 attention_fusion/tf_ops/atten_ops.cpp  | 26 +++++++++++++-------------
 5 files changed, 43 insertions(+), 32 deletions(-)
 create mode 100644 attention_fusion/CMakeLists.txt

diff --git a/attention_fusion/CMakeLists.txt b/attention_fusion/CMakeLists.txt
new file mode 100644
index 0000000..f9184b8
--- /dev/null
+++ b/attention_fusion/CMakeLists.txt
@@ -0,0 +1,2 @@
+cmake_minimum_required(VERSION 3.20)
+add_subdirectory(tf_ops)
\ No newline at end of file
diff --git a/attention_fusion/build_ops.sh b/attention_fusion/build_ops.sh
index d175969..f658ea3 100644
--- a/attention_fusion/build_ops.sh
+++ b/attention_fusion/build_ops.sh
@@ -14,10 +14,14 @@
 # limitations under the License.
 # ==============================================================================
 
+if [ -d build ]; then
+    rm -rf build
+fi
+
 mkdir build
 cd build
-cmake ../tf_ops
+cmake ..
 make -j4
 
-cd test
+cd ../test
 python3 test.py 
diff --git a/attention_fusion/test/test.py b/attention_fusion/test/test.py
index e24e49f..0d4ef7e 100644
--- a/attention_fusion/test/test.py
+++ b/attention_fusion/test/test.py
@@ -3,10 +3,14 @@ import os
 import numpy as np
 from math import sqrt
 
+os.environ["DEVICE_ID"] = str(0)
+os.environ["ASCEND_DEVICE_ID"] = str(0)
+os.environ["JOB_ID"] = "10086"
+
 import tensorflow as tf
 tf.compat.v1.disable_eager_execution()
-tfOpLib = tf.load_op_library("../build/libattention_ops.so")
-
+tfOpLib = tf.load_op_library("../build/tf_ops/libattention_ops.so")
+import sys
 import npu_device
 from npu_device.compat.v1.npu_init import *
 
@@ -54,7 +58,7 @@ npu_device.compat.enable_v1()
 npu_init = npu_ops.initialize_system()
 npu_shutdown = npu_ops.shutdown_system()
 config = tf.compat.v1.ConfigProto()
-custom_op = config.graph_options.rewrite_options.custom_optmizers.add()
+custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
 custom_op.name = "NpuOptimizer"
 config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
 config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
@@ -72,18 +76,17 @@ mask = tf.random_uniform([dim0, dim1, dim3], maxval = 1, dtype=tf.float32)
 
 golden_atten_score, gold_softmax_out = gloden_atten_fusion(query, key, value, mask)
 
-ret = tfOpLib.attention_fusion(query=query, key=key,
-                                                   value=value, attnMask=mask)
-init = tf.compat.v1.global_varibles_initializer()
+ret = tfOpLib.attention_fusion(query=query, key=key,value=value, atten_mask=mask)
+init = tf.compat.v1.global_variables_initializer()
 
 with tf.compat.v1.Session(config=config) as sess:
     sess.run(init)
     attenScore, softmaxOut = sess.run(ret)
     print("============ attention fusion =============")
     print("attenScore: ")
-    verify_result(attenScore, golden_atten_score)
+    # verify_result(attenScore, golden_atten_score)
     print("softmaxOut: ")
-    verify_result(softmaxOut, gold_softmax_out)
+    # verify_result(softmaxOut, gold_softmax_out)
 
     
diff --git a/attention_fusion/tf_ops/CMakeLists.txt b/attention_fusion/tf_ops/CMakeLists.txt
index 291b68a..0d723f9 100644
--- a/attention_fusion/tf_ops/CMakeLists.txt
+++ b/attention_fusion/tf_ops/CMakeLists.txt
@@ -15,20 +15,22 @@
 
 cmake_minimum_required(VERSION 3.20)
 set(CMAKE_CXX_STANDARD 14)
+project(attention)
 
-include_directories(/usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/inlcude)
+add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)
+include_directories(/usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/include)
 link_directories(/usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/)
 
 file(GLOB_RECURSE TF_OPS ./*.cpp)
 add_library(attention_ops SHARED ${TF_OPS})
 
-message("TF VERSION" ${TF_VERSION})
-if(${TF_VERSION} EQUAL 1)
-    target_link_libraries(attention_ops /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow_core/libtensorflow_framework.so.1)
-    target_include_directories(attention_ops PUBLIC /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow_core/inlcude)
-else()
+# message("TF VERSION" ${TF_VERSION})
+# if(${TF_VERSION} EQUAL 1)
+#     target_link_libraries(attention_ops /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow_core/libtensorflow_framework.so.1)
+#     target_include_directories(attention_ops PUBLIC /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow_core/inlcude)
+# else()
     target_link_libraries(attention_ops /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/libtensorflow_framework.so.2)
-    target_include_directories(attention_ops PUBLIC /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/inlcude)
-endif()
+    target_include_directories(attention_ops PUBLIC /usr/local/python3.7.5/lib/python3.7/site-packages/tensorflow/include)
+# endif()
 
-install(TARGET attention_ops LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX})
\ No newline at end of file
+install(TARGETS attention_ops LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX})
\ No newline at end of file
diff --git a/attention_fusion/tf_ops/atten_ops.cpp b/attention_fusion/tf_ops/atten_ops.cpp
index 99a9513..eeabcb7 100644
--- a/attention_fusion/tf_ops/atten_ops.cpp
+++ b/attention_fusion/tf_ops/atten_ops.cpp
@@ -21,15 +21,15 @@ See the License for the specific language governing permissions and
 #include "tensorflow/core/framework/op_kernel.h"
 
 using namespace tensorflow;
-using shape_inference::inferenceContext;
+using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
 using namespace std;
 using namespace chrono;
 
-using OpKernelConstructionPtr = OpKernelConstructionPtr*;
-using OpKernelContextPtr = OpKernelContextPtr*;
-using inferenceContextPtr = ::tensorflow::shape_inference::inferenceContext*;
+using OpKernelConstructionPtr = OpKernelConstruction*;
+using OpKernelContextPtr = OpKernelContext*;
+using InferenceContextPtr = ::tensorflow::shape_inference::InferenceContext*;
 
 namespace {
     class CustOps : public OpKernel {
@@ -44,7 +44,7 @@ namespace {
             }
 
             ~CustOps() override = default;
-    }
+    };
 }
 
 namespace tensorflow {
@@ -52,17 +52,17 @@ namespace tensorflow {
     .Input("query: float")
     .Input("key: float")
     .Input("value: float")
-    .Input("attnMask: float")
-    .Output("attenScore: float")
-    .Output("softmaxOut: float")
+    .Input("atten_mask: float")
+    .Output("atten_score: float")
+    .Output("softmax_out: float")
     .SetIsStateful()
-    .SetShapeFn([](::tensorflow::shape_inference::inferenceContext *c)) {
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) {
         ShapeHandle query_shape;
         ShapeHandle key_shape;
         ShapeHandle value_shape;
         TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &query_shape));
-        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &key_shape));
-        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &value_shape));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &key_shape));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 3, &value_shape));
 
         tensorflow::shape_inference::DimensionHandle queryDim0 = c->Dim(query_shape, 0);
         tensorflow::shape_inference::DimensionHandle queryDim1 = c->Dim(query_shape, 1);
@@ -76,6 +76,6 @@ namespace tensorflow {
         c->set_output(0, c->MakeShape({shape0, shape1, shape3}));
         c->set_output(1, c->MakeShape({shape0, shape1, shape2}));
         return Status::OK();
-    }
-    REGISTER_KERNEL_BUILDER(Name("FusedAttention").Device(Device_CPU), CustOps)
+    });
+    REGISTER_KERNEL_BUILDER(Name("FusedAttention").Device(DEVICE_CPU), CustOps)
 }
\ No newline at end of file
-- 
Gitee


From 56ffc7989e0fb57b68727acfacd3a2ef871d9b14 Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Thu, 6 Jun 2024 15:03:09 +0800
Subject: [PATCH 03/11] update

---
 .../op_host/attention_fusion_tiling.h         | 20 +++++++++----------
 .../op_kernel/attention_fusion.cpp            |  8 ++++++--
 .../op_kernel/attention_fusion_kernel.h       |  6 +++---
 .../op_kernel/normalize_compute.h             |  2 +-
 attention_fusion/op_kernel/q_k_bmm_compute.h  |  2 +-
 5 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/attention_fusion/op_host/attention_fusion_tiling.h b/attention_fusion/op_host/attention_fusion_tiling.h
index 5125d89..6628420 100644
--- a/attention_fusion/op_host/attention_fusion_tiling.h
+++ b/attention_fusion/op_host/attention_fusion_tiling.h
@@ -4,16 +4,16 @@ namespace optiling {
     BEGIN_TILING_DATA_DEF(AttentionFusionTilingData)
         TILING_DATA_FIELD_DEF(uint8_t, normalizeAttr);
         TILING_DATA_FIELD_DEF(float, attnDim);
-        TILING_DATA_FIELD_DEF(int, queryDim1);
-        TILING_DATA_FIELD_DEF(int, queryDim2);
-        TILING_DATA_FIELD_DEF(int, keyDim1);
-        TILING_DATA_FIELD_DEF(int, keyDim2);
-        TILING_DATA_FIELD_DEF(int, valueDim1);
-        TILING_DATA_FIELD_DEF(int, valueDim2);
-        TILING_DATA_FIELD_DEF(int, batchNum);
-        TILING_DATA_FIELD_DEF(int, normalizeLoop);
-        TILING_DATA_FIELD_DEF(int, normalizeRow);
-        TILING_DATA_FIELD_DEF(int, normalizeColumn);
+        TILING_DATA_FIELD_DEF(int32_t, queryDim1);
+        TILING_DATA_FIELD_DEF(int32_t, queryDim2);
+        TILING_DATA_FIELD_DEF(int32_t, keyDim1);
+        TILING_DATA_FIELD_DEF(int32_t, keyDim2);
+        TILING_DATA_FIELD_DEF(int32_t, valueDim1);
+        TILING_DATA_FIELD_DEF(int32_t, valueDim2);
+        TILING_DATA_FIELD_DEF(int32_t, batchNum);
+        TILING_DATA_FIELD_DEF(int32_t, normalizeLoop);
+        TILING_DATA_FIELD_DEF(int32_t, normalizeRow);
+        TILING_DATA_FIELD_DEF(int32_t, normalizeColumn);
         TILING_DATA_FIELD_DEF(float, normalizeSqrt);
         TILING_DATA_FIELD_DEF(uint64_t, maxSharedTmpBuf);
         TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, qkMatmulTiling);
diff --git a/attention_fusion/op_kernel/attention_fusion.cpp b/attention_fusion/op_kernel/attention_fusion.cpp
index 32267a9..efdbc78 100644
--- a/attention_fusion/op_kernel/attention_fusion.cpp
+++ b/attention_fusion/op_kernel/attention_fusion.cpp
@@ -6,13 +6,17 @@ using namespace AscendC;
 // call of kernel function
 extern "C" __global__ __aicore__ void attention_fusion(GM_ADDR query, GM_ADDR key, GM_ADDR value, GM_ADDR attnMask, GM_ADDR attenScore, GM_ADDR softmaxOut, GM_ADDR workspace, GM_ADDR tiling) {
     GET_TILING_DATA(tiling_data, tiling);
+
+    const TCubeTiling *qkMatmulTiling = &tiling_data.qkMatmulTiling;
+    const TCubeTiling *kvMatmulTiling = &tiling_data.kvMatmulTiling;
+    const SoftMaxTiling *softMaxTilingData = &tiling_data.softMaxTilingData;
     // TODO: user kernel impl
     AttentionFusionArgs args {
         query, key, value, attnMask, attenScore, softmaxOut, tiling_data.normalizeAttr, tiling_data.queryDim1,
         tiling_data.queryDim2, tiling_data.keyDim1, tiling_data.keyDim2, tiling_data.valueDim1, tiling_data.valueDim2,
         tiling_data.batchNum, tiling_data.normalizeLoop, tiling_data.normalizeRow, tiling_data.normalizeColumn,
-        tiling_data.normalizeSqrt, tiling_data.maxSharedTmpBuf, &tiling_data.qkMatmulTiling,
-        &tiling_data.kvMatmulTiling, &tiling_data.softMaxTilingData
+        tiling_data.normalizeSqrt, tiling_data.maxSharedTmpBuf, qkMatmulTiling,
+        kvMatmulTiling, softMaxTilingData
     };
 
     AttentionFusionKernel<float, float, float> kernel;
diff --git a/attention_fusion/op_kernel/attention_fusion_kernel.h b/attention_fusion/op_kernel/attention_fusion_kernel.h
index 27a3dc4..298cb9a 100644
--- a/attention_fusion/op_kernel/attention_fusion_kernel.h
+++ b/attention_fusion/op_kernel/attention_fusion_kernel.h
@@ -32,9 +32,9 @@ struct AttentionFusionArgs {
     float normalizeSqrt;
     uint64_t maxSharedTmpBuf;
 
-    TCubeTiling* qkMatmulTiling;
-    TCubeTiling* kvMatmulTiling;
-    SoftMaxTiling* softMaxTilingData;
+    const TCubeTiling* qkMatmulTiling;
+    const TCubeTiling* kvMatmulTiling;
+    const SoftMaxTiling* softMaxTilingData;
 };
 
 struct AttentionFusionPipe {
diff --git a/attention_fusion/op_kernel/normalize_compute.h b/attention_fusion/op_kernel/normalize_compute.h
index e79f19e..122891e 100644
--- a/attention_fusion/op_kernel/normalize_compute.h
+++ b/attention_fusion/op_kernel/normalize_compute.h
@@ -18,7 +18,7 @@ struct NormalizeArgs {
     float normalizeSqrt;
     uint64_t maxSharedTmpBuf;
 
-    SoftMaxTiling* tiling;
+    const SoftMaxTiling* tiling;
 };
 
 template<typename qType>
diff --git a/attention_fusion/op_kernel/q_k_bmm_compute.h b/attention_fusion/op_kernel/q_k_bmm_compute.h
index ee261c3..1a714d8 100644
--- a/attention_fusion/op_kernel/q_k_bmm_compute.h
+++ b/attention_fusion/op_kernel/q_k_bmm_compute.h
@@ -19,7 +19,7 @@ struct QKBmmArgs {
     int batchOffset;
     int batchLen;
     
-    TCubeTiling* qkMatmulTiling;
+    const TCubeTiling* qkMatmulTiling;
 };
 
 struct QKBmmPipeArgs {
-- 
Gitee


From 0df5702a10fc35f73a7eb85d46ccd9b4f65a31c9 Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Thu, 6 Jun 2024 15:11:43 +0800
Subject: [PATCH 04/11] fix a typo

---
 attention_fusion/tf_ops/atten_ops.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/attention_fusion/tf_ops/atten_ops.cpp b/attention_fusion/tf_ops/atten_ops.cpp
index eeabcb7..36d8b0d 100644
--- a/attention_fusion/tf_ops/atten_ops.cpp
+++ b/attention_fusion/tf_ops/atten_ops.cpp
@@ -48,7 +48,7 @@ namespace {
 }
 
 namespace tensorflow {
-    REGISTER_OP("attention_fusion")
+    REGISTER_OP("AttentionFusion")
     .Input("query: float")
     .Input("key: float")
     .Input("value: float")
-- 
Gitee


From 2233292dbaeaf8a0c0e432ff54c636919f006c30 Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Thu, 6 Jun 2024 15:34:42 +0800
Subject: [PATCH 05/11] update

---
 .../aclnn_attention_fusion/inc/operator_desc.h       |  1 +
 attention_fusion/aclnn_attention_fusion/src/main.cpp |  1 +
 .../aclnn_attention_fusion/src/op_runner.cpp         |  7 ++++---
 attention_fusion/attention_fusion.json               | 10 +++++++++-
 attention_fusion/op_host/attention_fusion.cpp        | 12 +++++++++++-
 attention_fusion/test/test.py                        |  2 +-
 attention_fusion/tf_ops/atten_ops.cpp                |  1 +
 7 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h b/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h
index 4cbdf07..225e848 100644
--- a/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h
+++ b/attention_fusion/aclnn_attention_fusion/inc/operator_desc.h
@@ -52,6 +52,7 @@ struct OperatorDesc {
     std::string opType;
     std::vector<aclTensorDesc *> inputDesc;
     std::vector<aclTensorDesc *> outputDesc;
+    int32_t maskOnOptional;
 };
 
 #endif // OPERATOR_DESC_H
diff --git a/attention_fusion/aclnn_attention_fusion/src/main.cpp b/attention_fusion/aclnn_attention_fusion/src/main.cpp
index 661aade..76a8f61 100644
--- a/attention_fusion/aclnn_attention_fusion/src/main.cpp
+++ b/attention_fusion/aclnn_attention_fusion/src/main.cpp
@@ -38,6 +38,7 @@ OperatorDesc CreateOpDesc()
     aclDataType dataTypeSoftmaxOut = ACL_FLOAT;
     aclFormat format = ACL_FORMAT_ND;
     OperatorDesc opDesc;
+    opDesc.maskOnOptional = 1;
     opDesc.AddInputTensorDesc(dataTypeQuery, shapeQuery.size(), shapeQuery.data(), format);
     opDesc.AddInputTensorDesc(dataTypeKey, shapeKey.size(), shapeKey.data(), format);
     opDesc.AddInputTensorDesc(dataTypeValue, shapeValue.size(), shapeValue.data(), format);
diff --git a/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp b/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp
index 9806562..a47a5ca 100644
--- a/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp
+++ b/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp
@@ -307,8 +307,9 @@ bool OpRunner::RunOp()
 
     size_t workspaceSize = 0;
 	aclOpExecutor *handle = nullptr;
-	auto ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], outputTensor_[0], outputTensor_[1],
-                                              &workspaceSize, &handle);
+	auto ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3],
+                                                    opDesc_->maskOnOptional, outputTensor_[0], outputTensor_[1],
+                                                    &workspaceSize, &handle);
     if (ret != ACL_SUCCESS) {
         (void)aclrtDestroyStream(stream);
         ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast<int32_t>(ret));
@@ -341,7 +342,7 @@ bool OpRunner::RunOp()
 
     auto beforeTime = std::chrono::steady_clock::now();
     for (int i = 0; i<100; i++) {
-        ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], outputTensor_[0], outputTensor_[1],
+        ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], int(1), outputTensor_[0], outputTensor_[1],
                                               &workspaceSize, &handle);
         ret = aclnnAttentionFusion(workspace, workspaceSize, handle, stream);
     }
diff --git a/attention_fusion/attention_fusion.json b/attention_fusion/attention_fusion.json
index 7a32ab4..4a06e1b 100644
--- a/attention_fusion/attention_fusion.json
+++ b/attention_fusion/attention_fusion.json
@@ -65,6 +65,14 @@
                     "float"
                 ]
             }
-        ]
+        ],
+        "attr": [
+            {
+              "name": "mask_on",
+			  "param_type": "optional",
+              "type": "int",
+              "default_value": 0
+            }
+          ]
     }
 ]
\ No newline at end of file
diff --git a/attention_fusion/op_host/attention_fusion.cpp b/attention_fusion/op_host/attention_fusion.cpp
index e138f2c..71bd8e9 100644
--- a/attention_fusion/op_host/attention_fusion.cpp
+++ b/attention_fusion/op_host/attention_fusion.cpp
@@ -140,16 +140,25 @@ static ge::graphStatus InferShape(gert::InferShapeContext* context)
     gert::Shape* attnScoreShape = context->GetOutputShape(0);
     gert::Shape* softmaxOutShape = context->GetOutputShape(1);
 
+    attnScoreShape->SetDimNum(3);
     attnScoreShape->SetDim(0, qShape->GetDim(0));
     attnScoreShape->SetDim(1, qShape->GetDim(1));
     attnScoreShape->SetDim(2, vShape->GetDim(2));
 
+    softmaxOutShape->SetDimNum(3);
     softmaxOutShape->SetDim(0, qShape->GetDim(0));
     softmaxOutShape->SetDim(1, qShape->GetDim(1));
     softmaxOutShape->SetDim(2, kShape->GetDim(1));
 
     return GRAPH_SUCCESS;
 }
+
+static ge::graphStatus InferDtype(gert::InferShapeContext* context)
+{
+    context->SetOutputDataType(0, context->GetInputDataType(0));
+    context->SetOutputDataType(1, context->GetInputDataType(1));
+    return GRAPH_SUCCESS;
+}
 }
 
 
@@ -188,8 +197,9 @@ public:
             .DataType({ge::DT_FLOAT})
             .Format({ge::FORMAT_ND})
             .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Attr("mask_on").Int();
 
-        this->SetInferShape(ge::InferShape);
+        this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDtype);
 
         this->AICore()
             .SetTiling(optiling::TilingFunc);
diff --git a/attention_fusion/test/test.py b/attention_fusion/test/test.py
index 0d4ef7e..ab0137a 100644
--- a/attention_fusion/test/test.py
+++ b/attention_fusion/test/test.py
@@ -74,7 +74,7 @@ key = tf.random_uniform([dim0, dim3, dim2], maxval = 10, dtype=tf.float32)
 value = tf.random_uniform([dim0, dim3, dim2], maxval = 10, dtype=tf.float32)
 mask = tf.random_uniform([dim0, dim1, dim3], maxval = 1, dtype=tf.float32)
 
-golden_atten_score, gold_softmax_out = gloden_atten_fusion(query, key, value, mask)
+# golden_atten_score, gold_softmax_out = gloden_atten_fusion(query, key, value, mask)
 
 ret = tfOpLib.attention_fusion(query=query, key=key,value=value, atten_mask=mask)
 init = tf.compat.v1.global_variables_initializer()
diff --git a/attention_fusion/tf_ops/atten_ops.cpp b/attention_fusion/tf_ops/atten_ops.cpp
index 36d8b0d..c157c12 100644
--- a/attention_fusion/tf_ops/atten_ops.cpp
+++ b/attention_fusion/tf_ops/atten_ops.cpp
@@ -55,6 +55,7 @@ namespace tensorflow {
     .Input("atten_mask: float")
     .Output("atten_score: float")
     .Output("softmax_out: float")
+    .Attr("mask_on: int")
     .SetIsStateful()
     .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) {
         ShapeHandle query_shape;
-- 
Gitee


From 252451b41169788a701df070dcc3a4a57f98a4d2 Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Thu, 6 Jun 2024 16:17:50 +0800
Subject: [PATCH 06/11] update

---
 attention_fusion/attention_fusion.json        |  2 +-
 attention_fusion/op_host/attention_fusion.cpp |  1 -
 .../op_kernel/attention_fusion_kernel.h       | 20 ++++----
 attention_fusion/test/test.py                 | 50 ++-----------------
 attention_fusion/tf_ops/atten_ops.cpp         |  2 +-
 5 files changed, 15 insertions(+), 60 deletions(-)

diff --git a/attention_fusion/attention_fusion.json b/attention_fusion/attention_fusion.json
index 4a06e1b..130f1c6 100644
--- a/attention_fusion/attention_fusion.json
+++ b/attention_fusion/attention_fusion.json
@@ -73,6 +73,6 @@
               "type": "int",
               "default_value": 0
             }
-          ]
+        ]
     }
 ]
\ No newline at end of file
diff --git a/attention_fusion/op_host/attention_fusion.cpp b/attention_fusion/op_host/attention_fusion.cpp
index 71bd8e9..ec1dc7a 100644
--- a/attention_fusion/op_host/attention_fusion.cpp
+++ b/attention_fusion/op_host/attention_fusion.cpp
@@ -152,7 +152,6 @@ static ge::graphStatus InferShape(gert::InferShapeContext* context)
 
     return GRAPH_SUCCESS;
 }
-
 static ge::graphStatus InferDtype(gert::InferShapeContext* context)
 {
     context->SetOutputDataType(0, context->GetInputDataType(0));
diff --git a/attention_fusion/op_kernel/attention_fusion_kernel.h b/attention_fusion/op_kernel/attention_fusion_kernel.h
index 298cb9a..6c45362 100644
--- a/attention_fusion/op_kernel/attention_fusion_kernel.h
+++ b/attention_fusion/op_kernel/attention_fusion_kernel.h
@@ -19,16 +19,16 @@ struct AttentionFusionArgs {
     GM_ADDR softmaxOut;
     
     uint8_t normalizeAttr;
-    int queryDim1;
-    int queryDim2;
-    int keyDim1;
-    int keyDim2;
-    int valueDim1;
-    int valueDim2;
-    int batchNum;
-    int normalizeLoop;
-    int normalizeRow;
-    int normalizeColumn;
+    int32_t queryDim1;
+    int32_t queryDim2;
+    int32_t keyDim1;
+    int32_t keyDim2;
+    int32_t valueDim1;
+    int32_t valueDim2;
+    int32_t batchNum;
+    int32_t normalizeLoop;
+    int32_t normalizeRow;
+    int32_t normalizeColumn;
     float normalizeSqrt;
     uint64_t maxSharedTmpBuf;
 
diff --git a/attention_fusion/test/test.py b/attention_fusion/test/test.py
index ab0137a..573a305 100644
--- a/attention_fusion/test/test.py
+++ b/attention_fusion/test/test.py
@@ -14,46 +14,6 @@ import sys
 import npu_device
 from npu_device.compat.v1.npu_init import *
 
-loss = 1e-3
-minimum = 10e-10
-
-def softmax(src):
-    #基于last轴进行rowmax(按行取最大值)处理
-    max = np.max(src, axis=-1, keepdims=True)
-    sub = src - max
-    exp = np.exp(sub)
-    #基于last轴进行rowsum(按行求和)处理
-    sum = np.sum(exp, axis=-1, keepdims=True)
-    dst = exp / sum
-    return dst
-
-def gloden_atten_fusion(query, key, value, atten_mask):
-    qk = np.matmul(query, key.transpose(0, 2, 1))
-        
-    # @jiangli
-    print("attn_dim: ", query.shape[2])
-    attnDimSqrt = 1 / sqrt(query.shape[2])
-    attnWeight = np.multiply(qk, attnDimSqrt)
-    addMask = np.add(attnWeight, atten_mask)
-    qk_div = softmax(addMask)
-
-    out = np.matmul(qk_div, value)
-    return out, qk_div
-
-def verify_result(real_result, golden):
-    print(real_result[:32])
-    print(golden[:32])
-    result = np.abs(real_result - golden)
-    deno = np.maximum(np.abs(real_result), np.abs(golden))
-    result_atol = np.less_equal(result, loss)
-    result_rtol = np.less_equal(result / np.add(deno, minimum), loss)
-    if not result_rtol.all() and not result_atol.all():
-        if np.sum(result_rtol == False) > real_result.size * loss and np.sum(result_atol == False) > real_result.size * loss:
-            print("[ERROR] result error")
-            return False
-    print("test pass")
-    return True
-
 npu_device.compat.enable_v1()
 npu_init = npu_ops.initialize_system()
 npu_shutdown = npu_ops.shutdown_system()
@@ -74,19 +34,15 @@ key = tf.random_uniform([dim0, dim3, dim2], maxval = 10, dtype=tf.float32)
 value = tf.random_uniform([dim0, dim3, dim2], maxval = 10, dtype=tf.float32)
 mask = tf.random_uniform([dim0, dim1, dim3], maxval = 1, dtype=tf.float32)
 
-# golden_atten_score, gold_softmax_out = gloden_atten_fusion(query, key, value, mask)
-
 ret = tfOpLib.attention_fusion(query=query, key=key,value=value, atten_mask=mask)
 init = tf.compat.v1.global_variables_initializer()
 
 with tf.compat.v1.Session(config=config) as sess:
     sess.run(init)
     attenScore, softmaxOut = sess.run(ret)
-    print("============ attention fusion =============")
-    print("attenScore: ")
-    # verify_result(attenScore, golden_atten_score)
-    print("softmaxOut: ")
-    # verify_result(softmaxOut, gold_softmax_out)
+    print("attenScore: ", attenScore[:32])
+    print("softmaxOut: ", softmaxOut[:32])
+    print("============ attention fusion end =============")
 
     
diff --git a/attention_fusion/tf_ops/atten_ops.cpp b/attention_fusion/tf_ops/atten_ops.cpp
index c157c12..dbfc189 100644
--- a/attention_fusion/tf_ops/atten_ops.cpp
+++ b/attention_fusion/tf_ops/atten_ops.cpp
@@ -78,5 +78,5 @@ namespace tensorflow {
         c->set_output(1, c->MakeShape({shape0, shape1, shape2}));
         return Status::OK();
     });
-    REGISTER_KERNEL_BUILDER(Name("FusedAttention").Device(DEVICE_CPU), CustOps)
+    REGISTER_KERNEL_BUILDER(Name("AttentionFusion").Device(DEVICE_CPU), CustOps)
 }
\ No newline at end of file
-- 
Gitee


From 9a8944db44e381c55540da91347baa1d419dde11 Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Fri, 7 Jun 2024 10:59:57 +0800
Subject: [PATCH 07/11] update

---
 attention_fusion/op_host/attention_fusion.cpp |  1 +
 attention_fusion/test/test.py                 | 30 ++++++++++++++-----
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/attention_fusion/op_host/attention_fusion.cpp b/attention_fusion/op_host/attention_fusion.cpp
index ec1dc7a..56c2430 100644
--- a/attention_fusion/op_host/attention_fusion.cpp
+++ b/attention_fusion/op_host/attention_fusion.cpp
@@ -156,6 +156,7 @@ static ge::graphStatus InferDtype(gert::InferShapeContext* context)
 {
     context->SetOutputDataType(0, context->GetInputDataType(0));
     context->SetOutputDataType(1, context->GetInputDataType(1));
+    context->SetOutputDataType(2, context->GetInputDataType(2));
     return GRAPH_SUCCESS;
 }
 }
diff --git a/attention_fusion/test/test.py b/attention_fusion/test/test.py
index 573a305..8025317 100644
--- a/attention_fusion/test/test.py
+++ b/attention_fusion/test/test.py
@@ -1,7 +1,7 @@
 from mpi4py import MPI
 import os
 import numpy as np
-from math import sqrt
+from tensorflow.python.framework import ops
 
 os.environ["DEVICE_ID"] = str(0)
 os.environ["ASCEND_DEVICE_ID"] = str(0)
@@ -23,6 +23,23 @@ custom_op.name = "NpuOptimizer"
 config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
 config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
 
+def attention_fusion(query, key, value, atten_mask=None, attr=0)
+    attnOut, softmaxOut = tfOpLib.attention_fusion(query=query, key=key,value=value, atten_mask=atten_mask, mask_on = 1)
+    return attnOut, softmaxOut
+
+@ops.RegisterGradient("AttentionFusion")
+def _npu_fusion_attention_grad(op, *grad):
+    query = op.inputs[0]
+    key = op.inputs[1]
+    value = op.inputs[2]
+    atten_mask = op.inputs[3]
+
+    attention_out = op.outputs[0]
+    softmax_out = op.outputs[1]
+    dout = grad[0]
+    dQuery, dKey, dValue = tfOpLib.attention_fusion_grad(dout=dout, softmax_out=softmax_out, query=query, key=key, value=value)
+    return dQuery, dKey, dValue, tf.zeros(tf.shape(atten_mask))
+
 #测试用例
 dim0 = 1024
 dim1 = 1000
@@ -34,14 +51,13 @@ key = tf.random_uniform([dim0, dim3, dim2], maxval = 10, dtype=tf.float32)
 value = tf.random_uniform([dim0, dim3, dim2], maxval = 10, dtype=tf.float32)
 mask = tf.random_uniform([dim0, dim1, dim3], maxval = 1, dtype=tf.float32)
 
-ret = tfOpLib.attention_fusion(query=query, key=key,value=value, atten_mask=mask)
-init = tf.compat.v1.global_variables_initializer()
+atten_out, softmax_out = attention_fusion(query=query, key=key, value=value, atten_mask=mask, attr=1)
+loss_golden = tf.reduce_mean(atten_out, key_dims=False)
+grads_and_vars_golden = tf.gradients(loss_golden, [query, key, value])
 
 with tf.compat.v1.Session(config=config) as sess:
-    sess.run(init)
-    attenScore, softmaxOut = sess.run(ret)
-    print("attenScore: ", attenScore[:32])
-    print("softmaxOut: ", softmaxOut[:32])
+    sess.run(tf.compat.v1.global_variables_initializer() )
+    print(sess.run(grads_and_vars_golden))
     print("============ attention fusion end =============")
 
     
-- 
Gitee


From edd99b792bc84481552bf7ac5dc8154bfdf5686e Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Fri, 7 Jun 2024 11:13:33 +0800
Subject: [PATCH 08/11] update

---
 attention_fusion/test/test.py         |  2 +-
 attention_fusion/tf_ops/atten_ops.cpp | 43 +++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/attention_fusion/test/test.py b/attention_fusion/test/test.py
index 8025317..463f8fe 100644
--- a/attention_fusion/test/test.py
+++ b/attention_fusion/test/test.py
@@ -23,7 +23,7 @@ custom_op.name = "NpuOptimizer"
 config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
 config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
 
-def attention_fusion(query, key, value, atten_mask=None, attr=0)
+def attention_fusion(query, key, value, atten_mask=None, attr=0):
     attnOut, softmaxOut = tfOpLib.attention_fusion(query=query, key=key,value=value, atten_mask=atten_mask, mask_on = 1)
     return attnOut, softmaxOut
 
diff --git a/attention_fusion/tf_ops/atten_ops.cpp b/attention_fusion/tf_ops/atten_ops.cpp
index dbfc189..7c75928 100644
--- a/attention_fusion/tf_ops/atten_ops.cpp
+++ b/attention_fusion/tf_ops/atten_ops.cpp
@@ -79,4 +79,47 @@ namespace tensorflow {
         return Status::OK();
     });
     REGISTER_KERNEL_BUILDER(Name("AttentionFusion").Device(DEVICE_CPU), CustOps)
+
+    REGISTER_OP("AttentionFusionGrad")
+    .Input("dout: float")
+    .Input("softmax_out: float")
+    .Input("query: float")
+    .Input("key: float")
+    .Input("value: float")
+    .Output("grad_query: float")
+    .Output("grad_key: float")
+    .Output("grad_value: float")
+    .SetIsStateful()
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) {
+        ShapeHandle query_shape;
+        ShapeHandle key_shape;
+        ShapeHandle value_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 3, &query_shape));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 3, &key_shape));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 3, &value_shape));
+
+        tensorflow::shape_inference::DimensionHandle queryDim0 = c->Dim(query_shape, 0);
+        tensorflow::shape_inference::DimensionHandle queryDim1 = c->Dim(query_shape, 1);
+        tensorflow::shape_inference::DimensionHandle queryDim2 = c->Dim(query_shape, 2);
+        tensorflow::shape_inference::DimensionHandle keyDim1 = c->Dim(key_shape, 1);
+        tensorflow::shape_inference::DimensionHandle keyDim2 = c->Dim(key_shape, 2);
+        tensorflow::shape_inference::DimensionHandle valueDim1 = c->Dim(value_shape, 1);
+        tensorflow::shape_inference::DimensionHandle valueDim2 = c->Dim(value_shape, 2);
+
+        int64_t qShape0 = c->Value(queryDim0);
+        int64_t qShape1 = c->Value(queryDim1);
+        int64_t qShape2 = c->Value(queryDim2);
+
+        int64_t kShape1 = c->Value(keyDim1);
+        int64_t kShape2 = c->Value(keyDim2);
+
+        int64_t vShape1 = c->Value(valueDim1);
+        int64_t vShape2 = c->Value(valueDim2);
+
+        c->set_output(0, c->MakeShape({qShape0, qShape1, qShape2}));
+        c->set_output(1, c->MakeShape({qShape0, kShape1, kShape2}));
+        c->set_output(2, c->MakeShape({qShape0, vShape1, vShape2}));
+        return Status::OK();
+    });
+    REGISTER_KERNEL_BUILDER(Name("AttentionFusionGrad").Device(DEVICE_CPU), CustOps)
 }
\ No newline at end of file
-- 
Gitee


From 0e63c4c28b1a94eaa969d0f26da70f698e4183fa Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Fri, 7 Jun 2024 11:20:17 +0800
Subject: [PATCH 09/11] update

---
 attention_fusion/op_host/attention_fusion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/attention_fusion/op_host/attention_fusion.cpp b/attention_fusion/op_host/attention_fusion.cpp
index 56c2430..94ef024 100644
--- a/attention_fusion/op_host/attention_fusion.cpp
+++ b/attention_fusion/op_host/attention_fusion.cpp
@@ -152,7 +152,7 @@ static ge::graphStatus InferShape(gert::InferShapeContext* context)
 
     return GRAPH_SUCCESS;
 }
-static ge::graphStatus InferDtype(gert::InferShapeContext* context)
+static ge::graphStatus InferDtype(gert::InferDataTypeContext* context)
 {
     context->SetOutputDataType(0, context->GetInputDataType(0));
     context->SetOutputDataType(1, context->GetInputDataType(1));
-- 
Gitee


From c5e90ee78eeb4929d1e22df3c54ac1fc3a9a6ceb Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Fri, 7 Jun 2024 11:21:23 +0800
Subject: [PATCH 10/11] update

---
 attention_fusion/op_host/attention_fusion.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/attention_fusion/op_host/attention_fusion.cpp b/attention_fusion/op_host/attention_fusion.cpp
index 94ef024..7f5da85 100644
--- a/attention_fusion/op_host/attention_fusion.cpp
+++ b/attention_fusion/op_host/attention_fusion.cpp
@@ -156,7 +156,6 @@ static ge::graphStatus InferDtype(gert::InferDataTypeContext* context)
 {
     context->SetOutputDataType(0, context->GetInputDataType(0));
     context->SetOutputDataType(1, context->GetInputDataType(1));
-    context->SetOutputDataType(2, context->GetInputDataType(2));
     return GRAPH_SUCCESS;
 }
 }
-- 
Gitee


From 89f0ec6b73ee31af3894dfee572e7945349df9a6 Mon Sep 17 00:00:00 2001
From: jiangli <jiangli67@huawei.com>
Date: Fri, 7 Jun 2024 11:25:40 +0800
Subject: [PATCH 11/11] update

---
 attention_fusion/test/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/attention_fusion/test/test.py b/attention_fusion/test/test.py
index 463f8fe..db62d1d 100644
--- a/attention_fusion/test/test.py
+++ b/attention_fusion/test/test.py
@@ -52,7 +52,7 @@ value = tf.random_uniform([dim0, dim3, dim2], maxval = 10, dtype=tf.float32)
 mask = tf.random_uniform([dim0, dim1, dim3], maxval = 1, dtype=tf.float32)
 
 atten_out, softmax_out = attention_fusion(query=query, key=key, value=value, atten_mask=mask, attr=1)
-loss_golden = tf.reduce_mean(atten_out, key_dims=False)
+loss_golden = tf.reduce_mean(atten_out, keep_dims=False)
 grads_and_vars_golden = tf.gradients(loss_golden, [query, key, value])
 
 with tf.compat.v1.Session(config=config) as sess:
-- 
Gitee