diff --git a/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py b/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
index 09629ea0fc10153711791180bbc5462693a0b34d..00e54f56dfc4fd81005e1c298b19613aa033282e 100644
--- a/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
+++ b/attention_fusion/aclnn_attention_fusion/scripts/gen_data.py
@@ -23,6 +23,7 @@ def gloden_atten_fusion(query, key, value, atten_mask):
     print("attn_dim: ", query.shape[2])
     attnDimSqrt = 1 / sqrt(query.shape[2])
     attnWeight = np.multiply(qk, attnDimSqrt)
+    atten_mask = np.add(10000, np.multiply(atten_mask, -10000))
     addMask = np.add(attnWeight, atten_mask)
     qk_div = softmax(addMask)
 
@@ -37,11 +38,12 @@ def gen_golden_data_simple():
     # input_atten_mask = np.ones([1024, 1000, 50]).astype(np.float32)
 
     input_query = np.random.uniform(-1, 1, [1024, 1000, 80]).astype(np.float32)
-    input_key = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
-    input_value = np.random.uniform(-1, 1, [1024, 50, 80]).astype(np.float32)
-    input_atten_mask = np.random.uniform(-1, 1, [1024, 1000, 50]).astype(np.float32)
-
+    input_key = np.random.uniform(-1, 1, [1024, 56, 80]).astype(np.float32)
+    input_value = np.random.uniform(-1, 1, [1024, 56, 80]).astype(np.float32)
+    input_atten_mask = np.random.randint(0, 2, size=(1024, 1000, 56)).astype(np.float32)
+
+    # input_atten_mask = np.random.uniform(-1, 1, [1024, 1000, 50]).astype(np.float32)
     golden_atten_score, gold_softmax_out = gloden_atten_fusion(input_query, input_key, input_value, input_atten_mask)
 
     os.system("mkdir -p input")
diff --git a/attention_fusion/aclnn_attention_fusion/src/main.cpp b/attention_fusion/aclnn_attention_fusion/src/main.cpp
index 76a8f61c7038e83a1431d605d1ab264e8843fefa..2a7b5dacfaaf49b4ec06ce0d2af54b3bf8b43094 100644
--- a/attention_fusion/aclnn_attention_fusion/src/main.cpp
+++ b/attention_fusion/aclnn_attention_fusion/src/main.cpp
@@ -25,11 +25,11 @@ OperatorDesc CreateOpDesc()
 {
     // define operator
     std::vector<int64_t> shapeQuery { 1024, 1000, 80 };
-    std::vector<int64_t> shapeKey { 1024, 50, 80 };
-    std::vector<int64_t> shapeValue { 1024, 50, 80 };
-    std::vector<int64_t> shapeAttenMask { 1024, 1000, 50 };
+    std::vector<int64_t> shapeKey { 1024, 56, 80 };
+    std::vector<int64_t> shapeValue { 1024, 56, 80 };
+    std::vector<int64_t> shapeAttenMask { 1024, 1000, 56 };
     std::vector<int64_t> shapeAttenScore{ 1024, 1000, 80 };
-    std::vector<int64_t> shapeSoftmaxOut {1024, 1000, 50 };
+    std::vector<int64_t> shapeSoftmaxOut {1024, 1000, 56 };
     aclDataType dataTypeQuery = ACL_FLOAT;
     aclDataType dataTypeKey = ACL_FLOAT;
     aclDataType dataTypeValue = ACL_FLOAT;
diff --git a/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp b/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp
index a47a5ca5b07d2986b917ddb79ce7c71eec2bc6cf..0bba4a1b3d14066fd25d2aec906689015533b99e 100644
--- a/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp
+++ b/attention_fusion/aclnn_attention_fusion/src/op_runner.cpp
@@ -342,8 +342,9 @@ bool OpRunner::RunOp()
     auto beforeTime = std::chrono::steady_clock::now();
     for (int i = 0; i<100; i++) {
-        ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], int(1), outputTensor_[0], outputTensor_[1],
-            &workspaceSize, &handle);
+        ret = aclnnAttentionFusionGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3],
+            opDesc_->maskOnOptional, outputTensor_[0], outputTensor_[1],
+            &workspaceSize, &handle);
         ret = aclnnAttentionFusion(workspace, workspaceSize, handle, stream);
     }
     ret = aclrtSynchronizeStreamWithTimeout(stream, 5000);
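Note on the gen_data.py change above: the golden reference now consumes a 0/1 mask and folds it into an additive bias before the softmax. A minimal NumPy sketch of that transform, with shapes shrunk for readability (the snippet only restates the arithmetic in the diff and is not part of the sample code):

    import numpy as np

    # 0/1 mask, matching np.random.randint(0, 2, ...) in gen_data.py
    mask = np.random.randint(0, 2, size=(2, 4, 3)).astype(np.float32)

    # bias = 10000 + mask * -10000, i.e. (1 - mask) * 10000:
    #   mask == 1 -> bias 0     (score passes through unchanged)
    #   mask == 0 -> bias 10000 (score is shifted before the softmax)
    bias = np.add(10000, np.multiply(mask, -10000))
    assert np.array_equal(bias, (1 - mask) * 10000)
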
diff --git a/attention_fusion/op_kernel/normalize_compute.h b/attention_fusion/op_kernel/normalize_compute.h
index 122891e1109ead33b6418c66022529ad15a5712c..75a7ec85f391e93321d95f40313715300d697b05 100644
--- a/attention_fusion/op_kernel/normalize_compute.h
+++ b/attention_fusion/op_kernel/normalize_compute.h
@@ -29,8 +29,9 @@ public:
     __aicore__ inline void Init(NormalizeArgs normalArgs)
     {
         this->args = normalArgs;
-        args.pipe->InitBuffer(vecInQueue, 1, args.normalizeRow * args.normalizeColumn * sizeof(qType));
-        args.pipe->InitBuffer(vecOutQueue, 1, args.normalizeRow * args.normalizeColumn * sizeof(qType));
+        int bufSize = args.normalizeRow * args.normalizeColumn * sizeof(qType);
+        args.pipe->InitBuffer(vecInQueue, 1, bufSize);
+        args.pipe->InitBuffer(vecOutQueue, 1, bufSize);
         args.pipe->InitBuffer(vecSharedQueue, 1, args.maxSharedTmpBuf);
     }
     __aicore__ inline void Process(GlobalTensor<qType> softmaxGlobleTensor, GlobalTensor<qType> softmaxGbMask)
@@ -82,10 +83,15 @@ private:
         LocalTensor<qType> LocalMask = vecSharedQueue.DeQue<qType>();
         LocalTensor<qType> outLocalTensor = vecOutQueue.AllocTensor<qType>();
 
+        // atten_weight = qkMatMul / sqrt(atten_dim)
         Muls(outLocalTensor, inLocalTensor, args.normalizeSqrt, totalSize);
-        // DataCopy(inLocalTensor, outLocalTensor, totalSize);
+
+        // atten_mask = (1 - mask) * 10000
+        Muls(LocalMask, LocalMask, (float)-10000, totalSize);
+        Adds(LocalMask, LocalMask, (float)10000, totalSize);
+
+        // atten_weight = atten_weight + atten_mask
         Add(inLocalTensor, outLocalTensor, LocalMask, totalSize);
-        // DataCopy(inLocalTensor, outLocalTensor, totalSize);
         vecSharedQueue.FreeTensor(LocalMask);
 
         LocalTensor<qType> sharedTmpBuf = vecSharedQueue.AllocTensor<qType>();
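Note on the normalize_compute.h change above: the kernel applies the same affine mask transform as the golden script, split into a scalar multiply (Muls) and a scalar add (Adds) over the mask tile, followed by an elementwise Add into the scaled attention weights. A rough NumPy model of the updated compute sequence, ignoring tiling and queue management (function and variable names are illustrative only, not taken from the kernel sources):

    import numpy as np

    def normalize_with_mask(qk, mask, atten_dim):
        """NumPy stand-in for the NormalizeCompute inner loop (illustrative only)."""
        out = qk * (1.0 / np.sqrt(atten_dim))  # Muls(outLocalTensor, inLocalTensor, normalizeSqrt, ...)
        mask = mask * -10000.0                 # Muls(LocalMask, LocalMask, -10000, ...)
        mask = mask + 10000.0                  # Adds(LocalMask, LocalMask, 10000, ...)
        return out + mask                      # Add(inLocalTensor, outLocalTensor, LocalMask, ...)

    qk = np.random.uniform(-1, 1, (1000, 56)).astype(np.float32)
    mask = np.random.randint(0, 2, (1000, 56)).astype(np.float32)
    scores = normalize_with_mask(qk, mask, atten_dim=80)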