From 40a6788a02c28e1990591f8e04b6379a4282397f Mon Sep 17 00:00:00 2001
From: jiangli
Date: Wed, 26 Jun 2024 15:14:56 +0800
Subject: [PATCH 1/4] add special method for shape 1024 1 64 1000, change cube multiply to vector multiply

---
 .../scripts/gen_data.py                       | 20 +++---
 .../op_kernel/attention_fusion_kernel.h       | 24 ++++---
 attention_fusion/op_kernel/k_v_bmm_compute.h  |  3 -
 .../aclnn_attention_fusion_grad/src/main.cpp  |  3 -
 .../op_kernel/attention_fusion_grad_kernel.h  | 24 ++++---
 .../op_kernel/normalize_grad.h                | 69 +++++++++++++++++--
 attention_fusion_grad/op_kernel/v_s_mm_grad.h |  7 +-
 7 files changed, 107 insertions(+), 43 deletions(-)

diff --git a/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py b/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
index 95915a7..b781c1d 100644
--- a/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
+++ b/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
@@ -31,16 +31,16 @@ def gloden_atten_fusion(query, key, value, atten_mask):
     return out, qk_div
 
 def gen_golden_data_simple():
-    input_query = np.ones([1024, 1000, 80]).astype(np.float32)
-    input_key = np.ones([1024, 50, 80]).astype(np.float32)
-    input_key = 0-input_key
-    input_value = np.ones([1024, 50, 80]).astype(np.float32)
-    input_atten_mask = np.ones([1024, 1000, 50]).astype(np.float32)
-
-    # input_query = np.random.uniform(-1, 1, [1024, 1000, 80]).astype(np.float32)
-    # input_key = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
-    # input_value = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
-    # input_atten_mask = np.random.randint(0,2,size=(1024, 1000, 50)).astype(np.float32)
+    # input_query = np.ones([1024, 1000, 80]).astype(np.float32)
+    # input_key = np.ones([1024, 50, 80]).astype(np.float32)
+    # input_key = 0-input_key
+    # input_value = np.ones([1024, 50, 80]).astype(np.float32)
+    # input_atten_mask = np.ones([1024, 1000, 50]).astype(np.float32)
+
+    input_query = np.random.uniform(-1, 1, [1024, 1000, 80]).astype(np.float32)
+    input_key = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
+    input_value = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
+    input_atten_mask = np.random.randint(0,2,size=(1024, 1000, 50)).astype(np.float32)
 
     # input_atten_mask = np.random.uniform(-1, 1, [1024, 1000, 50]).astype(np.float32)
 
diff --git a/attention_fusion/op_kernel/attention_fusion_kernel.h b/attention_fusion/op_kernel/attention_fusion_kernel.h
index a0eb638..43ad76f 100644
--- a/attention_fusion/op_kernel/attention_fusion_kernel.h
+++ b/attention_fusion/op_kernel/attention_fusion_kernel.h
@@ -121,15 +121,21 @@ class AttentionFusionKernel {
         GlobalTensor softmaxGbMaskThisCore;
         softmaxGbMaskThisCore.SetGlobalBuffer(reinterpret_cast<__gm__ qType*>(args.attnMask),
             batchLen * args.queryDim1 * args.keyDim1);
-        for(int i = 0; i < batchLen; i++) {
-            GlobalTensor softmaxOutGbTensor =
-                softmaxOutGbTensorThisCore[(batchOffset+i) * args.queryDim1 * args.keyDim1];
-            GlobalTensor softmaxGbMaskTensor =
-                softmaxGbMaskThisCore[(batchOffset+i) * args.queryDim1 * args.keyDim1];
-            /* normallize */
-            normalizeCompute.Process(softmaxOutGbTensor, softmaxGbMaskTensor);
-            /* matmul */
-            kvBmmCompute.ComputeOneBatch(i);
+        for(int i = 0; i < batchLen + 1; i++) {
+            if (i != batchLen) {
+                GlobalTensor softmaxOutGbTensor =
+                    softmaxOutGbTensorThisCore[(batchOffset+i) * args.queryDim1 * args.keyDim1];
+                GlobalTensor softmaxGbMaskTensor =
+                    softmaxGbMaskThisCore[(batchOffset+i) * args.queryDim1 * args.keyDim1];
+                /* normallize */
+                normalizeCompute.Process(softmaxOutGbTensor, softmaxGbMaskTensor);
+            }
+
+            if (i != 0) {
+                /* matmul */
+                kvBmmCompute.ComputeOneBatch(i - 1);
+            }
+
         }
     }
 
diff --git a/attention_fusion/op_kernel/k_v_bmm_compute.h b/attention_fusion/op_kernel/k_v_bmm_compute.h
index c4ced5b..a87d17c 100644
--- a/attention_fusion/op_kernel/k_v_bmm_compute.h
+++ b/attention_fusion/op_kernel/k_v_bmm_compute.h
@@ -49,9 +49,6 @@ public:
             mm.End();
         }
 
-        event_t evenId = static_cast(pipeArgs.pipe->FetchEventID(HardEvent::MTE3_MTE2));
-        SetFlag(evenId);
-        WaitFlag(evenId);
         mm.SetTensorA(sGlobal[batchI * kvBmmArgs.M * kvBmmArgs.K]);
         mm.SetTensorB(vGlobal[batchI * kvBmmArgs.N * kvBmmArgs.K]);
 
diff --git a/attention_fusion_grad/aclnn_attention_fusion_grad/src/main.cpp b/attention_fusion_grad/aclnn_attention_fusion_grad/src/main.cpp
index f1f3bbd..e351475 100644
--- a/attention_fusion_grad/aclnn_attention_fusion_grad/src/main.cpp
+++ b/attention_fusion_grad/aclnn_attention_fusion_grad/src/main.cpp
@@ -34,9 +34,6 @@ OperatorDesc CreateOpDesc()
     std::vector grad_key { 1024, 50, 80 };
     std::vector grad_value { 1024, 50, 80 };
 
-
-
-
     aclFormat format = ACL_FORMAT_ND;
     OperatorDesc opDesc;
     opDesc.AddInputTensorDesc(ACL_FLOAT, dout.size(), dout.data(), format);
diff --git a/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h b/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h
index 56ed383..338296d 100644
--- a/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h
+++ b/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h
@@ -90,28 +90,36 @@ class AttentionFusionGradKernel {
     __aicore__ inline void Process()
     {
         NormalNizeMatmulFusion();
-
     }
 
 private:
 
    __aicore__ inline void NormalNizeMatmulFusion()
    {
-        NormGradArgs normGradArgs {args.softmaxOut, args.gradSoftmax, args.queryDim1, args.keyDim1, args.batchNum,
-            batchOffsetThisCore , batchLenThisCore, args.numOfNormalnizeOnce, args.paddingKeyDim1,args.attenDimSqrt,
-            args.keyDimAlign, args.softmaxtiling, args.confusionTransposeTilingData,
-            args.confusionTransposeTilingData1, args.confusionTransposeTilingData2,
-            args.confusionTransposeTilingData3};
+        NormGradArgs normGradArgs {
+            args.dout, args.gradValue, args.softmaxOut, args.gradSoftmax, args.queryDim1, args.keyDim1,
+            args.valueDim1, args.valueDim2, args.batchNum, batchOffsetThisCore , batchLenThisCore,
+            args.numOfNormalnizeOnce, args.paddingKeyDim1, args.attenDimSqrt, args.keyDimAlign, args.softmaxtiling,
+            args.confusionTransposeTilingData, args.confusionTransposeTilingData1,
+            args.confusionTransposeTilingData2, args.confusionTransposeTilingData3
+        };
        NormGradPipeArgs normGradPipe {&pipe};
        NormalGradCompute normalCompute;
        normalCompute.Init(normGradArgs, normGradPipe);
-
+        bool specialCase = args.queryDim1 == 1 && args.keyDim1 == 1000 && args.valueDim2 == 80;
        for (int thisBatch = 0 ; thisBatch < normGradArgs.batchLen; thisBatch++) {
            vSmm.ProcessDS(thisBatch);
+            if (specialCase == true) {
+                normalCompute.ProcessDV(thisBatch);
+            }
        }
+
        for (int thisBatch = 0 ; thisBatch < normGradArgs.batchLen; thisBatch++) {
-            vSmm.ProcessDV(thisBatch);
+            if (specialCase == false) {
+                vSmm.ProcessDV(thisBatch);
+            }
+
            normalCompute.ProcessOneBatch(thisBatch);
        }
 
        for (int thisBatch = 0 ; thisBatch < normGradArgs.batchLen; thisBatch++) {
diff --git a/attention_fusion_grad/op_kernel/normalize_grad.h b/attention_fusion_grad/op_kernel/normalize_grad.h
index 6eb6065..06fd7c7 100644
--- a/attention_fusion_grad/op_kernel/normalize_grad.h
+++ b/attention_fusion_grad/op_kernel/normalize_grad.h
@@ -6,11 +6,15 @@ using namespace AscendC;
 
 struct NormGradArgs {
+    GM_ADDR dout;
+    GM_ADDR gradValue;
     GM_ADDR softmaxOut;
     GM_ADDR gradSoftmax;
 
     int sDim1;
     int sDim2;
+    int vDim1;
+    int vDim2;
 
     int batchNum;
     int batchOffset;
 
@@ -40,12 +44,22 @@ public:
     __aicore__ inline void Init(NormGradArgs mmArgs, NormGradPipeArgs pipeArgs)
     {
         this->mmArgs = mmArgs;
-        softmaxOut.SetGlobalBuffer(reinterpret_cast<__gm__ tType*>(mmArgs.softmaxOut), mmArgs.batchNum * mmArgs.sDim1 * mmArgs.sDim2);
+        softmaxOut.SetGlobalBuffer(reinterpret_cast<__gm__ tType*>(mmArgs.softmaxOut),
+            mmArgs.batchNum * mmArgs.sDim1 * mmArgs.sDim2);
         softmaxOut = softmaxOut[mmArgs.batchOffset * mmArgs.sDim1 * mmArgs.sDim2];
 
-        gradSoftmax.SetGlobalBuffer(reinterpret_cast<__gm__ tType*>(mmArgs.gradSoftmax), mmArgs.batchNum * mmArgs.sDim1 * mmArgs.sDim2);
+        gradSoftmax.SetGlobalBuffer(reinterpret_cast<__gm__ tType*>(mmArgs.gradSoftmax),
+            mmArgs.batchNum * mmArgs.sDim1 * mmArgs.sDim2);
         gradSoftmax = gradSoftmax[mmArgs.batchOffset * mmArgs.sDim1 * mmArgs.sDim2];
 
+        dout.SetGlobalBuffer(reinterpret_cast<__gm__ tType*>(mmArgs.dout),
+            mmArgs.batchNum * mmArgs.sDim1 * mmArgs.vDim2);
+        dout = dout[mmArgs.batchOffset * mmArgs.sDim1 * mmArgs.vDim2];
+
+        gradValue.SetGlobalBuffer(reinterpret_cast<__gm__ tType*>(mmArgs.gradValue),
+            mmArgs.batchNum * mmArgs.vDim1 * mmArgs.vDim2);
+        gradValue = gradValue[mmArgs.batchOffset * mmArgs.vDim1 * mmArgs.vDim2];
+
         pipeArgs.pipe->InitBuffer(vecInQueue, 1, mmArgs.numOfNormalnizeOnce*mmArgs.paddingKeyDim1*sizeof(tType));
         pipeArgs.pipe->InitBuffer(vecInGradQueue, 1, mmArgs.numOfNormalnizeOnce*mmArgs.paddingKeyDim1*sizeof(tType));
         pipeArgs.pipe->InitBuffer(vecOutQueue, 1, mmArgs.numOfNormalnizeOnce*mmArgs.paddingKeyDim1*sizeof(tType));
 
@@ -74,7 +88,6 @@ public:
         DataCopy(sourceTensor, mindTensor, padSize);
     }
 
-
     __aicore__ inline void DoUnPadLocal(LocalTensor& sourceTensor, LocalTensor& mindTensor,
         const ConfusionTransposeTiling* confusionTransposeTilingData2,
         const ConfusionTransposeTiling* confusionTransposeTilingData3)
@@ -97,6 +110,54 @@ public:
         DataCopy(sourceTensor, mindTensor, padSize);
     }
 
+    __aicore__ inline void ProcessDV(uint32_t batchI)
+    {
+        GlobalTensor thisBatchSoftmaxGb = softmaxOut[batchI * mmArgs.sDim1 * mmArgs.sDim2];
+        GlobalTensor thisBatchDoutGb = dout[batchI * mmArgs.sDim1 * mmArgs.vDim2];
+        GlobalTensor gradValueGb = gradValue[batchI * mmArgs.vDim1 * mmArgs.vDim2];
+
+        int total = mmArgs.sDim1 * mmArgs.sDim2;
+        int remain = total;
+
+        while (remain > 0) {
+            // caculate basic
+            int thisLen = mmArgs.numOfNormalnizeOnce * mmArgs.sDim2;
+            if (remain < thisLen) {
+                thisLen = remain;
+            }
+            int offset = total - remain;
+
+            LocalTensor inLocalTensor = vecInQueue.AllocTensor();
+            DataCopy(inLocalTensor, thisBatchDoutGb[offset], thisLen);
+
+            LocalTensor inGradLocalTensor = vecInGradQueue.AllocTensor();
+
+            const uint32_t dstShape_[] {100, 80};
+            const uint32_t srcShape_[] {1, 80};
+
+            BroadCast(inGradLocalTensor, inLocalTensor, dstShape_, srcShape_);
+            DataCopy(inLocalTensor, inGradLocalTensor, 1000 * 8);
+
+            for (int i = 0; i < 10; i++) {
+                LocalTensor outLocalTensor = vecOutQueue.AllocTensor();
+                LocalTensor mem = tmpBuff.Get();
+                for (int j = 0; j < 100; j++) {
+                    float v = *(thisBatchSoftmaxGb.GetPhyAddr() + i * 100 + j);
+                    Duplicate(inGradLocalTensor[j * 80], v, 80);
+                }
+
+                Mul(outLocalTensor, inGradLocalTensor, inLocalTensor, 8000);
+                DataCopy(gradValueGb[offset + i * 8000], outLocalTensor, 8000);
+                vecOutQueue.FreeTensor(outLocalTensor);
+            }
+
+            vecInQueue.FreeTensor(inLocalTensor);
+            vecInGradQueue.FreeTensor(inGradLocalTensor);
+
+            remain = remain - thisLen;
+        }
+    }
+
     __aicore__ inline void ProcessOneBatch(uint32_t batchI)
     {
         struct DataCopyExtParams copyParams{0, 0, 0, 0, 0}; // the last member of the DataCopyExtParams struct, rsv, is a reserved field
 
@@ -186,6 +247,6 @@ public:
     GlobalTensor softmaxOut;
     GlobalTensor gradSoftmax;
     GlobalTensor gradValue;
-
+    GlobalTensor dout;
 };
 #endif
\ No newline at end of file
diff --git a/attention_fusion_grad/op_kernel/v_s_mm_grad.h b/attention_fusion_grad/op_kernel/v_s_mm_grad.h
index 8f9377f..d3e5786 100644
--- a/attention_fusion_grad/op_kernel/v_s_mm_grad.h
+++ b/attention_fusion_grad/op_kernel/v_s_mm_grad.h
@@ -76,19 +76,15 @@ public:
         mmGradV.SetTensorB(dout[batchI * mmArgs.sDim1 * mmArgs.vDim2]);
         mmGradV.template IterateAll(gradValue[batchI * mmArgs.vDim1 * mmArgs.vDim2], 0, false, true);
-
-        // mm.IterateAll(dB[batchI * mmArgs.vDim1 * mmArgs.vDim2], 0, false);
     }
 
     __aicore__ inline void ProcessDS(uint32_t batchI)
     {
-        if (batchI != 0) {
-            mmGradS.WaitIterateAll();
-            mmGradS.End();
-        }
         mmGradS.SetTensorA(dout[batchI * mmArgs.sDim1 * mmArgs.vDim2]);
         mmGradS.SetTensorB(value[batchI * mmArgs.vDim1 * mmArgs.vDim2], true);
         mmGradS.template IterateAll(gradS[batchI * mmArgs.sDim1 * mmArgs.sDim2], 0, false, true);
+        mmGradS.WaitIterateAll();
     }
 
     matmul::Matmul<
@@ -108,7 +104,6 @@ public:
 private:
     VSMmGradArgs mmArgs;
     VSMmGradPipeArgs pipeArg;
-    // C = A*B dA=C*BT (vec) dB=AT*C (Cube)
     GlobalTensor softmaxOut;
     GlobalTensor value;
     GlobalTensor dout;
-- 
Gitee

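[Note on PATCH 1/4] The heart of this patch is the new ProcessDV in normalize_grad.h. When queryDim1 == 1 (the special case tested in attention_fusion_grad_kernel.h), each batch's grad_value = softmaxOut^T @ dout multiplies a [1, 1000] row by a [1, 80] row, a [1000, 80] outer product, so the cube-unit matmul that vSmm.ProcessDV would issue degenerates to M == 1. ProcessDV therefore computes it on the vector unit: BroadCast replicates the 80-element dout row into a [100, 80] tile, Duplicate splats each softmax scalar across 80 lanes, and Mul combines them, ten 8000-element tiles per batch. A minimal NumPy sketch of the same math (golden_dv is a hypothetical helper name, not code from this repo; shapes follow the hard-coded special case):

    import numpy as np

    def golden_dv(softmax_out, dout):
        # softmax_out: [batch, 1, key_dim1], dout: [batch, 1, value_dim2]
        # grad_value[b] = softmax_out[b].T @ dout[b], an outer product per batch
        return np.einsum('bmk,bmn->bkn', softmax_out, dout)

    s = np.random.uniform(-1, 1, [1024, 1, 1000]).astype(np.float32)  # softmax output
    dy = np.random.uniform(-1, 1, [1024, 1, 80]).astype(np.float32)   # dout
    dv = golden_dv(s, dy)                                             # [1024, 1000, 80]

    # The explicit outer-product form the vector path realizes:
    # row k of grad_value[b] is softmax_out[b, 0, k] * dout[b, 0, :].
    dv_ref = s[:, 0, :, None] * dy[:, 0, None, :]
    assert np.allclose(dv, dv_ref)
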
From 8f34650bcbef38ccae0b916a4c2a09c84e42631d Mon Sep 17 00:00:00 2001
From: jiangli
Date: Wed, 26 Jun 2024 15:47:53 +0800
Subject: [PATCH 2/4] fix clean code

---
 .../op_kernel/attention_fusion_kernel.h           |  7 +++----
 .../op_kernel/attention_fusion_grad_kernel.h      |  6 +++---
 attention_fusion_grad/op_kernel/normalize_grad.h  |  2 +-
 st_test/test/test.py                              | 13 ++++++-------
 4 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/attention_fusion/op_kernel/attention_fusion_kernel.h b/attention_fusion/op_kernel/attention_fusion_kernel.h
index 43ad76f..ef482ee 100644
--- a/attention_fusion/op_kernel/attention_fusion_kernel.h
+++ b/attention_fusion/op_kernel/attention_fusion_kernel.h
@@ -124,9 +124,9 @@ class AttentionFusionKernel {
         for(int i = 0; i < batchLen + 1; i++) {
             if (i != batchLen) {
                 GlobalTensor softmaxOutGbTensor =
-                    softmaxOutGbTensorThisCore[(batchOffset+i) * args.queryDim1 * args.keyDim1];
+                    softmaxOutGbTensorThisCore[(batchOffset + i) * args.queryDim1 * args.keyDim1];
                 GlobalTensor softmaxGbMaskTensor =
-                    softmaxGbMaskThisCore[(batchOffset+i) * args.queryDim1 * args.keyDim1];
+                    softmaxGbMaskThisCore[(batchOffset + i) * args.queryDim1 * args.keyDim1];
                 /* normallize */
                 normalizeCompute.Process(softmaxOutGbTensor, softmaxGbMaskTensor);
             }
@@ -135,14 +135,13 @@ class AttentionFusionKernel {
                 /* matmul */
                 kvBmmCompute.ComputeOneBatch(i - 1);
             }
-
         }
     }
 
     __aicore__ inline void GetBatchOffsetAndLen(int batchNum, int& batchOffset, int& batchLen)
     {
         // batch offset
-        int blockLenPerCore = CeilDiv(batchNum, (GetBlockNum()*2));
+        int blockLenPerCore = CeilDiv(batchNum, (GetBlockNum() * 2));
         batchOffset = blockLenPerCore*GetBlockIdx();
         batchLen = blockLenPerCore;
         if (batchOffset + batchLen > batchNum) {
diff --git a/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h b/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h
index 338296d..293f7f3 100644
--- a/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h
+++ b/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h
@@ -97,7 +97,7 @@ class AttentionFusionGradKernel {
     {
         NormGradArgs normGradArgs {
             args.dout, args.gradValue, args.softmaxOut, args.gradSoftmax, args.queryDim1, args.keyDim1,
-            args.valueDim1, args.valueDim2, args.batchNum, batchOffsetThisCore , batchLenThisCore,
+            args.valueDim1, args.valueDim2, args.batchNum, batchOffsetThisCore, batchLenThisCore,
             args.numOfNormalnizeOnce, args.paddingKeyDim1, args.attenDimSqrt, args.keyDimAlign, args.softmaxtiling,
             args.confusionTransposeTilingData, args.confusionTransposeTilingData1,
             args.confusionTransposeTilingData2, args.confusionTransposeTilingData3
@@ -131,8 +131,8 @@ class AttentionFusionGradKernel {
     __aicore__ inline void GetBatchOffsetAndLen(int batchNum, int& batchOffset, int& batchLen)
     {
         // batch offset
-        int blockLenPerCoreBase = batchNum / (GetBlockNum()*2);
-        int remain = batchNum % (GetBlockNum()*2);
+        int blockLenPerCoreBase = batchNum / (GetBlockNum() * 2);
+        int remain = batchNum % (GetBlockNum() * 2);
         if (GetBlockIdx() < remain) {
             batchLen = blockLenPerCoreBase + 1;
             batchOffset = GetBlockIdx() * batchLen;
diff --git a/attention_fusion_grad/op_kernel/normalize_grad.h b/attention_fusion_grad/op_kernel/normalize_grad.h
index 06fd7c7..df8b743 100644
--- a/attention_fusion_grad/op_kernel/normalize_grad.h
+++ b/attention_fusion_grad/op_kernel/normalize_grad.h
@@ -135,7 +135,7 @@ public:
             const uint32_t dstShape_[] {100, 80};
             const uint32_t srcShape_[] {1, 80};
 
-            BroadCast(inGradLocalTensor, inLocalTensor, dstShape_, srcShape_);
+            BroadCast(inGradLocalTensor, inLocalTensor, dstShape_, srcShape_);
             DataCopy(inLocalTensor, inGradLocalTensor, 1000 * 8);
 
             for (int i = 0; i < 10; i++) {
diff --git a/st_test/test/test.py b/st_test/test/test.py
index 9e6a2cf..6bd65b1 100644
--- a/st_test/test/test.py
+++ b/st_test/test/test.py
@@ -25,13 +25,11 @@ config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
 
 
 def attention_fusion(q, k, v, mask=None):
-    tf_shape = tf.shape(k)
-    tf_shape_query = tf.shape(q)
-
+    maskIsOn = 1
     if mask is None:
-        mask = tf.zeros(tf_shape[0], tf_shape_query[1], tf_shape[1])
-
-    attn_out, softmax_out = tfOpLib.attention_fusion(query=q, key=k, value=v, atten_mask=mask, mask_on = 1)
+        mask = tf.zeros(())
+        maskIsOn = 0
+    attn_out, softmax_out = tfOpLib.attention_fusion(query=q, key=k, value=v, atten_mask=mask, mask_on = maskIsOn)
     return attn_out, softmax_out
 
 
@@ -79,7 +77,8 @@ def generate_data(dim0, dim1, dim2, dim3, dim4):
     q = np.random.randn(dim0, dim1, dim2).astype(np.float32)
     k = np.random.randn(dim0, dim3, dim2).astype(np.float32)
     v = np.random.randn(dim0, dim3, dim4).astype(np.float32)
-    m = np.zeros((dim0, dim1, dim3)).astype(np.float32)
+    # m = np.zeros((dim0, dim1, dim3)).astype(np.float32)
+    m = np.random.randint(0, 2, size=(dim0, dim1, dim3)).astype(np.float32)
     return q, k, v, m
 
-- 
Gitee

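[Note on PATCH 2/4] Besides spacing cleanup, this patch reflows the software-pipelined loop that PATCH 1/4 introduced in attention_fusion_kernel.h. Stripped of the tensor bookkeeping, the loop runs one extra iteration so the vector-side normalize of batch i overlaps the cube-side matmul consuming batch i - 1. A schematic sketch (normalize and matmul are stand-ins for normalizeCompute.Process and kvBmmCompute.ComputeOneBatch):

    def process_all(batch_len, normalize, matmul):
        # Iteration i starts normalize(i) and retires matmul(i - 1);
        # the first iteration only fills the pipeline, the last only drains it.
        for i in range(batch_len + 1):
            if i != batch_len:
                normalize(i)   # produce softmax for batch i
            if i != 0:
                matmul(i - 1)  # consume the softmax produced last iteration

    process_all(4, lambda i: print('normalize', i), lambda i: print('matmul', i))
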
From b783998d8555de0909a416169a234d9a1efc4494 Mon Sep 17 00:00:00 2001
From: jiangli
Date: Thu, 27 Jun 2024 12:05:47 +0800
Subject: [PATCH 3/4] fix clean code; fix bug

---
 attention_fusion/op_kernel/normalize_compute.h |  2 +-
 .../op_host/attention_fusion_grad.cpp          |  4 ++--
 .../op_kernel/attention_fusion_grad.cpp        |  1 -
 .../op_kernel/attention_fusion_grad_kernel.h   | 18 +++++++++---------
 .../op_kernel/normalize_grad.h                 | 10 ++++++----
 attention_fusion_grad/op_kernel/v_s_mm_grad.h  |  3 +--
 6 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/attention_fusion/op_kernel/normalize_compute.h b/attention_fusion/op_kernel/normalize_compute.h
index 47d7e25..a1f1680 100644
--- a/attention_fusion/op_kernel/normalize_compute.h
+++ b/attention_fusion/op_kernel/normalize_compute.h
@@ -119,7 +119,7 @@ private:
         if (args.attr == 1) {
             DataCopy(LocalMask, maskGloblePtr[offset], totalSize);
         } else if (args.attr == 2) {
-            DataCopy(LocalMask, maskGloblePtr[offset], totalSize);
+            DataCopy(LocalMask, maskGloblePtr[offset], totalSize);
         } else {
             DataCopyPad(LocalMask, maskGloblePtr[offset], copyParams, padParams);
         }
diff --git a/attention_fusion_grad/op_host/attention_fusion_grad.cpp b/attention_fusion_grad/op_host/attention_fusion_grad.cpp
index ffa5624..7aba4ef 100644
--- a/attention_fusion_grad/op_host/attention_fusion_grad.cpp
+++ b/attention_fusion_grad/op_host/attention_fusion_grad.cpp
@@ -128,7 +128,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context)
 
     auto ascnedPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
     size_t coreNum = ascnedPlatform.GetCoreNumAic();
-    
+
     uint64_t ub;
     ascnedPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub);
     ub = ub - RESERVER_UB_SIZE;
@@ -152,7 +152,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context)
     std::vector shapeVec3 = {50* 8, 16};
     ge::Shape srcShape3(shapeVec3);
     AscendC::GetConfusionTransposeTilingInfo(srcShape3, 0, sizeof(float), 7, tilingData.confusionTransposeTilingData3);
-    
+
     context->SetBlockDim(coreNum);
     tilingData.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
     context->GetRawTilingData()->SetDataSize(tilingData.GetDataSize());
diff --git a/attention_fusion_grad/op_kernel/attention_fusion_grad.cpp b/attention_fusion_grad/op_kernel/attention_fusion_grad.cpp
index 4829b98..39f762e 100644
--- a/attention_fusion_grad/op_kernel/attention_fusion_grad.cpp
+++ b/attention_fusion_grad/op_kernel/attention_fusion_grad.cpp
@@ -25,5 +25,4 @@ extern "C" __global__ __aicore__ void attention_fusion_grad(GM_ADDR dout, GM_ADD
     };
     AttentionFusionGradKernel attentionGradKernel;
     attentionGradKernel.Compute(attentionFusionGradAgs);
-
 }
\ No newline at end of file
diff --git a/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h b/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h
index 293f7f3..82b0057 100644
--- a/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h
+++ b/attention_fusion_grad/op_kernel/attention_fusion_grad_kernel.h
@@ -60,14 +60,14 @@ class AttentionFusionGradKernel {
         this->args = args;
         GetBatchOffsetAndLen(args.batchNum, batchOffsetThisCore, batchLenThisCore);
         // Matmul Register
-        REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), 
+        REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(),
            vSmm.mmGradV, args.gardVMatmulTiling, vSmm.mmGradS, args.gardSMatmulTiling,
            qKmm.mmGradQ, args.gardQMatmulTiling, qKmm.mmGradK, args.gardKMatmulTiling);
-        // VSmm Initialize 
+        // VSmm Initialize
         VSMmGradArgs vSmmArgs {
-            args.softmaxOut, args.value, args.dout, args.gradValue, args.gradSoftmax, 
-            args.queryDim1, args.keyDim1, args.valueDim1, args.valueDim2, args.batchNum, 
+            args.softmaxOut, args.value, args.dout, args.gradValue, args.gradSoftmax,
+            args.queryDim1, args.keyDim1, args.valueDim1, args.valueDim2, args.batchNum,
            batchOffsetThisCore, batchLenThisCore
        };
 
        VSMmGradPipeArgs vSmmPieArgs {&pipe};
@@ -75,14 +75,14 @@ class AttentionFusionGradKernel {
 
        vSmm.Init(vSmmArgs, vSmmPieArgs);
 
-        // VSmm Initialize 
+        // VSmm Initialize
        QKMmGradArgs QKmmArgs {
-            args.query, args.key, args.gradSoftmax, args.gradQuery, args.gradKey, 
-            args.queryDim1, args.queryDim2, args.keyDim1, args.keyDim2, args.batchNum, 
+            args.query, args.key, args.gradSoftmax, args.gradQuery, args.gradKey,
+            args.queryDim1, args.queryDim2, args.keyDim1, args.keyDim2, args.batchNum,
            batchOffsetThisCore, batchLenThisCore
        };
-
-        qKmm.Init(QKmmArgs); 
+
+        qKmm.Init(QKmmArgs);
        // Start compute
        Process();
    }
diff --git a/attention_fusion_grad/op_kernel/normalize_grad.h b/attention_fusion_grad/op_kernel/normalize_grad.h
index df8b743..b59eeb9 100644
--- a/attention_fusion_grad/op_kernel/normalize_grad.h
+++ b/attention_fusion_grad/op_kernel/normalize_grad.h
@@ -129,14 +129,16 @@ public:
             LocalTensor inLocalTensor = vecInQueue.AllocTensor();
             DataCopy(inLocalTensor, thisBatchDoutGb[offset], thisLen);
+            vecInQueue.EnQue(inLocalTensor);
 
             LocalTensor inGradLocalTensor = vecInGradQueue.AllocTensor();
 
             const uint32_t dstShape_[] {100, 80};
             const uint32_t srcShape_[] {1, 80};
 
-            BroadCast(inGradLocalTensor, inLocalTensor, dstShape_, srcShape_);
-            DataCopy(inLocalTensor, inGradLocalTensor, 1000 * 8);
+            LocalTensor inLocalTensorCompute = vecInQueue.DeQue();
+            BroadCast(inGradLocalTensor, inLocalTensorCompute, dstShape_, srcShape_);
+            DataCopy(inLocalTensorCompute, inGradLocalTensor, 1000 * 8);
 
             for (int i = 0; i < 10; i++) {
                 LocalTensor outLocalTensor = vecOutQueue.AllocTensor();
@@ -146,12 +148,12 @@ public:
                     Duplicate(inGradLocalTensor[j * 80], v, 80);
                 }
 
-                Mul(outLocalTensor, inGradLocalTensor, inLocalTensor, 8000);
+                Mul(outLocalTensor, inGradLocalTensor, inLocalTensorCompute, 8000);
                 DataCopy(gradValueGb[offset + i * 8000], outLocalTensor, 8000);
                 vecOutQueue.FreeTensor(outLocalTensor);
             }
 
-            vecInQueue.FreeTensor(inLocalTensor);
+            vecInQueue.FreeTensor(inLocalTensorCompute);
             vecInGradQueue.FreeTensor(inGradLocalTensor);
 
             remain = remain - thisLen;
diff --git a/attention_fusion_grad/op_kernel/v_s_mm_grad.h b/attention_fusion_grad/op_kernel/v_s_mm_grad.h
index d3e5786..cabbae4 100644
--- a/attention_fusion_grad/op_kernel/v_s_mm_grad.h
+++ b/attention_fusion_grad/op_kernel/v_s_mm_grad.h
@@ -17,8 +17,7 @@ struct VSMmGradArgs {
     int sDim2;
 
     int vDim1;
-    int vDim2; 
-
+    int vDim2;
     int batchNum;
     int batchOffset;
-- 
Gitee

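[Note on PATCH 3/4] The "fix bug" part is the vecInQueue.EnQue/DeQue pair added to ProcessDV. In the Ascend C programming model, EnQue/DeQue on a queue is what inserts the synchronization between the copy-in pipe and the vector pipe; a tile that is only AllocTensor'd and DataCopy'd may still be in flight when BroadCast and Mul read it. A toy producer/consumer model of the discipline the fix restores (Python threading stands in for the hardware pipes; all names here are illustrative):

    import threading
    from queue import Queue

    vec_in_queue = Queue()

    def copy_in(tile):               # models the copy-in stage: DataCopy, then EnQue
        tile['data'] = [1.0] * 8000  # the actual transfer
        vec_in_queue.put(tile)       # EnQue publishes: the copy is complete

    def compute():                   # models the vector stage
        tile = vec_in_queue.get()    # DeQue blocks until the copy has landed
        return sum(tile['data'])     # safe to read: fully written

    producer = threading.Thread(target=copy_in, args=({},))
    producer.start()
    print(compute())                 # 8000.0
    producer.join()
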
From 31de83cf9f6faf7233dc364ab9fb3f06c5244528 Mon Sep 17 00:00:00 2001
From: jiangli
Date: Fri, 19 Jul 2024 11:53:07 +0800
Subject: [PATCH 4/4] update

---
 attention_fusion/op_host/attention_fusion.cpp        | 4 ++--
 attention_fusion/op_kernel/attention_fusion_kernel.h | 6 +++---
 attention_fusion/op_kernel/normalize_compute.h       | 2 --
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/attention_fusion/op_host/attention_fusion.cpp b/attention_fusion/op_host/attention_fusion.cpp
index 1e5232f..8209a8c 100644
--- a/attention_fusion/op_host/attention_fusion.cpp
+++ b/attention_fusion/op_host/attention_fusion.cpp
@@ -84,7 +84,7 @@ static int32_t SoftmaxTiling(gert::TilingContext* context, AttentionFusionTiling
     }
 
     if ((sizeof(float) * normalizeColumn) > (ub / UB_TILES)) {
-        TEST_LOG("[ERROR] key dim1 too large, LocalWorkSize insufficient");
+        TEST_LOG("[ERROR] key dim1 too large, LocalWorkspace insufficient.");
         return 1;
     }
 
@@ -104,7 +104,7 @@ static int32_t SoftmaxTiling(gert::TilingContext* context, AttentionFusionTiling
     const ge::Shape softmaxShape({normalizeRow, normalizeColumn});
     const uint32_t minLocalWorkSize = AscendC::GetSoftMaxMinTmpSize(softmaxShape, sizeof(float), false);
     if (minLocalWorkSize > maxLocalWorkSize) {
-        TEST_LOG("[ERROR] LocalWorkSize insufficient");
+        TEST_LOG("[ERROR] LocalWorkspace for SoftMax insufficient.");
         return 1;
     }
diff --git a/attention_fusion/op_kernel/attention_fusion_kernel.h b/attention_fusion/op_kernel/attention_fusion_kernel.h
index ef482ee..71be4a9 100644
--- a/attention_fusion/op_kernel/attention_fusion_kernel.h
+++ b/attention_fusion/op_kernel/attention_fusion_kernel.h
@@ -82,9 +82,9 @@ class AttentionFusionKernel {
         qKBmmCompute.Init(qKBmmArgs, qKBmmPipeArgs);
 
         NormalizeArgs normalArgs {
-            &pipe, args.normalizeAttr, args.queryDim1, args.keyDim1, batchOffset, batchLen, args.normalizeLoop,
-            args.normalizeRow, args.normalizeColumn, args.maskIsOn, args.normalizeSqrt, args.maxSharedTmpBuf,
-            args.softMaxTilingData, args.confusionTransposeTilingData, args.confusionTransposeTilingData1,
+            &pipe, args.normalizeAttr, args.queryDim1, args.keyDim1, args.normalizeLoop, args.normalizeRow,
+            args.normalizeColumn, args.maskIsOn, args.normalizeSqrt, args.maxSharedTmpBuf, args.softMaxTilingData,
+            args.confusionTransposeTilingData, args.confusionTransposeTilingData1,
             args.confusionTransposeTilingData2, args.confusionTransposeTilingData3
         };
         normalizeCompute.Init(normalArgs);
diff --git a/attention_fusion/op_kernel/normalize_compute.h b/attention_fusion/op_kernel/normalize_compute.h
index a1f1680..b9ff761 100644
--- a/attention_fusion/op_kernel/normalize_compute.h
+++ b/attention_fusion/op_kernel/normalize_compute.h
@@ -11,8 +11,6 @@ struct NormalizeArgs {
     uint8_t attr;
     int queryDim1;
     int keyDim1;
-    int batchOffset;
-    int batchLen;
     int loopCount;
     int normalizeRow;
     int normalizeColumn;
-- 
Gitee
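[Note on PATCH 4/4] This patch only rewords the tiling error messages and drops the now-unused batchOffset/batchLen members from NormalizeArgs, but the guard it touches is easy to restate: SoftmaxTiling rejects a shape when one float row of the softmax input does not fit its share of the unified buffer, i.e. when sizeof(float) * normalizeColumn > ub / UB_TILES. A sketch with assumed numbers (the UB size remaining after RESERVER_UB_SIZE and the value of UB_TILES are assumptions; neither appears in this diff):

    ub_bytes = 192 * 1024  # assumed usable unified buffer, in bytes
    UB_TILES = 8           # assumed number of slices the tiling divides UB into
    float_size = 4         # sizeof(float)

    max_column = ub_bytes // UB_TILES // float_size
    print(max_column)      # 6144: largest normalizeColumn this check would accept

    key_dim1 = 1000        # the special-case shape from PATCH 1/4
    assert float_size * key_dim1 <= ub_bytes // UB_TILES  # passes the check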