diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index a8d4b852ca3c248d31b247b4c2f6d2aa09647e51..d3c7c1759641ba1b4f3ddfb107553ed3e6ac644e 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -20,6 +20,7 @@ using dnnl::memory; using dnnl::primitive; using paddle::framework::DataLayout; using paddle::framework::ExecutionContext; +using paddle::platform::MatMulV2MKLDNNHandler; using paddle::platform::GetMKLDNNFormat; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; @@ -107,114 +108,6 @@ std::vector GetInputStrides(const ExecutionContext& ctx, return strides; } -template -class MatMulV2MKLDNNHandler - : public paddle::platform::MKLDNNHandlerNoCachingT { - public: - MatMulV2MKLDNNHandler(const dnnl::engine engine, - paddle::platform::Place cpu_place, - const std::vector& x_org_dims, bool trans_x, - const std::vector& y_org_dims, bool trans_y, - bool is_output_fused, - const std::vector& x_strides_override, - const std::vector& y_strides_override) - : paddle::platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - // M X K * K X N - std::vector x_dims(x_org_dims); - std::vector y_dims(y_org_dims); - - const int MB_idx = x_dims.size() - 3; - const int H_idx = x_dims.size() - 2; - const int W_idx = x_dims.size() - 1; - - if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); - if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); - - const memory::dim M = x_dims[H_idx]; - const memory::dim K = x_dims[W_idx]; - const memory::dim N = y_dims[W_idx]; - - std::vector x_strides(x_dims.size() - 3, 1); - std::vector y_strides(x_dims.size() - 3, 1); - std::vector out_strides(x_dims.size() - 3, 1); - std::vector out_ddims(x_dims.size() - 3, 1); - - x_strides.reserve(x_dims.size()); - y_strides.reserve(x_dims.size()); - out_strides.reserve(x_dims.size()); - - if (!x_strides_override.empty()) { - x_strides = x_strides_override; - } else { - if (!trans_x) { - x_strides.insert(x_strides.end(), {M * K, K, 1}); - } else { - x_strides.insert(x_strides.end(), {M * K, 1, M}); - } - } - - if (!y_strides_override.empty()) { - y_strides = y_strides_override; - } else { - if (!trans_y) { - y_strides.insert(y_strides.end(), {N * K, N, 1}); - } else { - y_strides.insert(y_strides.end(), {N * K, 1, K}); - } - } - - out_strides.insert(out_strides.end(), {M * N, N, 1}); - out_ddims.insert(out_ddims.end(), - {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); - - for (int i = x_dims.size() - 4; i >= 0; --i) { - out_ddims[i] = std::max(x_dims[i], y_dims[i]); - if (x_strides_override.empty()) { - x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; - } - if (y_strides_override.empty()) { - y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; - } - out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; - } - - if (is_output_fused) { - out_strides = FakeTransposeStrides(out_ddims); - } - - auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); - - this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); - } - - std::vector FakeTransposeStrides( - const std::vector& matmul_out_dims) const { - // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and - // transpose axis are: {0, 2, 1, 3} - std::vector transpose_axis = {0, 2, 1, 3}; - std::vector fake_strides(transpose_axis.size()); - 
int ndims = static_cast(transpose_axis.size()); - - int total_stride = 1; - - for (int i = ndims - 1; i >= 0; --i) { - fake_strides[transpose_axis[i]] = total_stride; - total_stride *= matmul_out_dims[transpose_axis[i]]; - } - - return fake_strides; - } - - std::shared_ptr AcquireWeightsMemory(const Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), - to_void_cast(input_data)); - } -}; - bool IsOutputFused(const ExecutionContext& ctx) { auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); auto& fused_transpose_Out = ctx.Attr>("fused_transpose_Out"); diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 422944107fb28050df3bc02aee9a43ee0aaf3b4d..49c896ef80fcc2aecd240e66c3e45e91bf1031ff 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/mul_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace framework { @@ -32,19 +32,22 @@ namespace operators { using framework::DataLayout; using framework::DDim; using framework::ExecutionContext; +using framework::LoDTensor; using framework::Tensor; -using mkldnn::inner_product_forward; -using mkldnn::memory; -using mkldnn::prop_kind; -using mkldnn::stream; + +using platform::MatMulV2MKLDNNHandler; using platform::MKLDNNDeviceContext; using platform::to_void_cast; +using dnnl::inner_product_forward; +using dnnl::memory; +using dnnl::prop_kind; +using dnnl::stream; + template class MulPrimitiveFactory { public: - explicit MulPrimitiveFactory(const mkldnn::engine &engine) - : engine_(engine) {} + explicit MulPrimitiveFactory(const dnnl::engine &engine) : engine_(engine) {} inner_product_forward CreateMulPrimitive(const Tensor *x_input, const Tensor *y_input, @@ -99,15 +102,15 @@ class MulPrimitiveFactory { const memory::desc &dst_desc, void *src_data, const std::vector &scale) { auto mask = scale.size() > 1 ? 1 : 0; - mkldnn::primitive_attr attr; + dnnl::primitive_attr attr; attr.set_output_scales(mask, scale); auto src_mem = memory(src_desc, engine_, src_data); auto dst_mem = memory(dst_desc, engine_); - auto reorder_pd = mkldnn::reorder::primitive_desc(src_mem, dst_mem, attr); + auto reorder_pd = dnnl::reorder::primitive_desc(src_mem, dst_mem, attr); - auto reorder = mkldnn::reorder(reorder_pd); + auto reorder = dnnl::reorder(reorder_pd); auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); { @@ -132,9 +135,9 @@ class MulPrimitiveFactory { scale_y); } - mkldnn::primitive_attr CreateMulAttr(const ExecutionContext &ctx, - bool force_fp32_output) { - mkldnn::primitive_attr mul_attr; + dnnl::primitive_attr CreateMulAttr(const ExecutionContext &ctx, + bool force_fp32_output) { + dnnl::primitive_attr mul_attr; auto scale_y_data = ctx.Attr>("scale_y"); auto scale_x_data = ctx.Attr("scale_x"); @@ -185,9 +188,9 @@ class MulPrimitiveFactory { void Execute() { auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); - (*mul_).execute(astream, {{MKLDNN_ARG_SRC, *x_input_}, - {MKLDNN_ARG_WEIGHTS, *y_input_}, - {MKLDNN_ARG_DST, *output_}}); + (*mul_).execute(astream, {{DNNL_ARG_SRC, *x_input_}, + {DNNL_ARG_WEIGHTS, *y_input_}, + {DNNL_ARG_DST, *output_}}); astream.wait(); } @@ -268,7 +271,7 @@ class MulPrimitiveFactory { auto dst_mem = dst_data ? 
memory(dst_desc, engine_, dst_data) : memory(dst_desc, engine_); - auto reorder = mkldnn::reorder(src_mem, dst_mem); + auto reorder = dnnl::reorder(src_mem, dst_mem); auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); { @@ -289,7 +292,7 @@ class MulPrimitiveFactory { return Reorder(src_desc, dst_desc, to_void_cast(input_y->data())); } - const mkldnn::engine &engine_; + const dnnl::engine &engine_; paddle::optional x_input_; paddle::optional y_input_; paddle::optional output_; @@ -303,7 +306,7 @@ template std::shared_ptr> GetPrimitiveFactory( const MKLDNNDeviceContext &dev_ctx, const ExecutionContext &ctx, const Tensor *input_x, const Tensor *input_y, - const mkldnn::engine &mkldnn_engine) { + const dnnl::engine &mkldnn_engine) { std::string key = platform::CreateKey( dev_ctx, input_x->type(), framework::vectorize(input_x->dims()), input_y->type(), framework::vectorize(input_y->dims()), @@ -327,7 +330,7 @@ inner_product_forward GetMulPrimitive(const MKLDNNDeviceContext &dev_ctx, const ExecutionContext &ctx, const Tensor *input_x, const Tensor *input_y, Tensor *output, - const mkldnn::engine &mkldnn_engine) { + const dnnl::engine &mkldnn_engine) { constexpr bool is_int8 = std::is_same::value || std::is_same::value; bool force_fp32_output = ctx.Attr("force_fp32_output"); @@ -346,7 +349,7 @@ inner_product_forward GetMulPrimitive(const MKLDNNDeviceContext &dev_ctx, /* XT: input x data type, YT: input y data type */ template -class MulMKLDNNKernel : public framework::OpKernel { +class MulMKLDNNINT8Kernel : public framework::OpKernel { public: void Compute(const ExecutionContext &ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, @@ -372,17 +375,175 @@ class MulMKLDNNKernel : public framework::OpKernel { } }; +template +class MulMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + + protected: + void ExecuteMatMul(const ExecutionContext &ctx, + const MKLDNNDeviceContext &dev_ctx, + const dnnl::engine &onednn_engine, + const platform::Place &cpu_place, const Tensor *x, + const std::vector &x_dims, bool trans_x, + const Tensor *y, const std::vector &y_dims, + bool trans_y, Tensor *out) const { + static const std::vector vec_placeholder; + MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, + trans_x, y_dims, trans_y, false, + vec_placeholder, vec_placeholder); + + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto &astream = MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + // plain output formats are enforced inside handler + out->set_format(platform::MKLDNNFormatForSize( + out->dims().size(), dnnl::memory::format_tag::nchw)); + } + + private: + void RunKernel(const ExecutionContext &ctx) const { + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); + + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + + const 
Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; + + // adding mb dim because MatMulV2 handler needs it + std::vector y_dims(3, 1); + std::vector x_dims(3, 1); + + y_dims[1] = y_matrix.dims()[0]; + y_dims[2] = y_matrix.dims()[1]; + + x_dims[1] = x_matrix.dims()[0]; + x_dims[2] = x_matrix.dims()[1]; + + ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), &x_matrix, + x_dims, false, &y_matrix, y_dims, false, out); + } +}; + +template +class MulGradMKLDNNKernel : public MulMKLDNNKernel { + public: + void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + + private: + template + void RunKernel(const ExecutionContext &ctx) const { + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dy = ctx.Output(framework::GradVarName("Y")); + + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : static_cast(*x); + const Tensor y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : static_cast(*y); + + Tensor dout_matrix = *dout; + dout_matrix.Resize( + {framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + + // adding mb dim because MatMulV2 handler needs it + std::vector x_dims(3, 1); + std::vector y_dims(3, 1); + std::vector dout_dims(3, 1); + + x_dims[1] = x_matrix.dims()[0]; + x_dims[2] = x_matrix.dims()[1]; + + y_dims[1] = y_matrix.dims()[0]; + y_dims[2] = y_matrix.dims()[1]; + + dout_dims[1] = dout_matrix.dims()[0]; + dout_dims[2] = dout_matrix.dims()[1]; + + if (dx != nullptr) { + dx->set_lod(x->lod()); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), + &dout_matrix, dout_dims, false, &y_matrix, y_dims, + true, static_cast(dx)); + } + if (dy != nullptr) { + dy->set_lod(y->lod()); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), + &x_matrix, x_dims, true, &dout_matrix, dout_dims, + false, static_cast(dy)); + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, U8, ops::kMULMKLDNNINT8, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, S8, ops::kMULMKLDNNINT8, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, + FP32, ops::kMULMKLDNNFP32, + ops::MulMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + mul, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kMULMKLDNNFP32, + ops::MulMKLDNNKernel); REGISTER_OP_KERNEL(mul, MKLDNN, ::paddle::platform::CPUPlace, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel, + ops::MulMKLDNNKernel, + ops::MulMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kMULMKLDNNFP32, + ops::MulGradMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + mul_grad, MKLDNN, ::paddle::platform::CPUPlace, BF16, 
ops::kMULMKLDNNFP32,
+    ops::MulGradMKLDNNKernel<paddle::platform::bfloat16,
+                             paddle::platform::bfloat16>,
+    ops::MulGradMKLDNNKernel<float, float>);
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 14291f84584308ffa7d5393b6e3fe5e47023ff77..691c394870ad4c6aa401e86246cfad50db30e9fa 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -113,6 +113,12 @@ class MulOp : public framework::OperatorWithKernel {
     if (input_data_type == framework::DataTypeTrait<int8_t>::DataType() ||
         input_data_type == framework::DataTypeTrait<uint8_t>::DataType()) {
       customized_type_value = kMULMKLDNNINT8;
+    } else if (input_data_type ==
+                   framework::DataTypeTrait<
+                       paddle::platform::bfloat16>::DataType() ||
+               input_data_type ==
+                   framework::DataTypeTrait<float>::DataType()) {
+      customized_type_value = kMULMKLDNNFP32;
     }
   }
 #endif
@@ -233,6 +239,36 @@ class MulGradOp : public framework::OperatorWithKernel {
       ctx->SetOutputDim(y_grad_name, y_dims);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const {
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    int customized_type_value =
+        framework::OpKernelType::kDefaultCustomizedTypeValue;
+    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+
+      if (input_data_type == framework::DataTypeTrait<int8_t>::DataType() ||
+          input_data_type == framework::DataTypeTrait<uint8_t>::DataType()) {
+        customized_type_value = kMULMKLDNNINT8;
+      } else if (input_data_type ==
+                     framework::DataTypeTrait<
+                         paddle::platform::bfloat16>::DataType() ||
+                 input_data_type ==
+                     framework::DataTypeTrait<float>::DataType()) {
+        customized_type_value = kMULMKLDNNFP32;
+      }
+    }
+#endif
+
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                   library, customized_type_value);
+  }
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h
index 3a13e0576e3472700908ebb6884e7306760a24c6..0fb32cf4be8864bc3a3f5da5c6c4318a4f24fb0d 100644
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
@@ -25,6 +25,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 constexpr int kMULMKLDNNINT8 = 1;
+constexpr int kMULMKLDNNFP32 = 2;
 
 template <typename DeviceContext, typename T>
 class MulKernel : public framework::OpKernel<T> {
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 8262273b7ca7da47dc47a2e7a02fa1f40b9d4727..5ab2004617810b34276632fa487e8f12d7b3b915 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -109,7 +109,6 @@ register_unity_group(cc
     gaussian_random_batch_size_like_op.cc
     gaussian_random_op.cc
     mkldnn/gaussian_random_mkldnn_op.cc
-    grid_sampler_op.cc
     group_norm_op.cc
     gru_op.cc)
 register_unity_group(cc
     hash_op.cc
@@ -187,14 +186,12 @@ register_unity_group(cc
     norm_op.cc
     one_hot_op.cc
     one_hot_v2_op.cc
-    p_norm_op.cc
     pad2d_op.cc
     pad3d_op.cc
     pad_constant_like_op.cc
     pad_op.cc)
 register_unity_group(cc
     modified_huber_loss_op.cc
-    mkldnn/mul_mkldnn_op.cc
     partial_sum_op.cc
     pixel_shuffle_op.cc
     pool_op.cc
@@ -469,7 +466,6 @@ register_unity_group(cu
     nll_loss_op.cu
     norm_op.cu
     one_hot_op.cu
-    p_norm_op.cu
     pad2d_op.cu
     pad3d_op.cu
     pad_constant_like_op.cu
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 084b47bb3c7a3b3b0622a79a985ea97e3854b4be..ef216e48416f9da9453bc6456c1bc051615d4435 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -33,14 +33,14 @@ namespace platform { using framework::DataLayout; using framework::Tensor; using user_function = std::function(const float*)>; -using memory = mkldnn::memory; +using memory = dnnl::memory; template class MKLDNNHandlerNoCachingT { public: - MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place) + MKLDNNHandlerNoCachingT(dnnl::engine engine, platform::Place cpu_place) : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { platform::MKLDNNDeviceContext::tls().log_lib_version(); } @@ -60,7 +60,7 @@ class MKLDNNHandlerNoCachingT { return std::make_shared(*bwd_w_pd_); } - std::shared_ptr AcquireSrcMemory( + std::shared_ptr AcquireSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), @@ -68,33 +68,33 @@ class MKLDNNHandlerNoCachingT { } template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { T_out* ptr = output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); } template - std::shared_ptr AcquireDstMemory(void) { + std::shared_ptr AcquireDstMemory(void) { return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); } template - std::shared_ptr AcquireDstMemory( + std::shared_ptr AcquireDstMemory( const framework::Tensor* output) { const T_out* output_data = output->data(); return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), to_void_cast(output_data)); } - std::shared_ptr AcquireDiffDstMemory( + std::shared_ptr AcquireDiffDstMemory( const framework::Tensor* diffdst) { const T* ptr = diffdst->data(); return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), to_void_cast(ptr)); } - std::shared_ptr AcquireDiffSrcMemory( + std::shared_ptr AcquireDiffSrcMemory( framework::Tensor* diffsrc) { T* ptr = diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); @@ -102,7 +102,7 @@ class MKLDNNHandlerNoCachingT { } // Buffer of given Tensor is used for oneDNN computation - std::shared_ptr AcquireDiffWeightsMemory( + std::shared_ptr AcquireDiffWeightsMemory( framework::Tensor* diff_weights) { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, @@ -115,7 +115,7 @@ class MKLDNNHandlerNoCachingT { } // Buffer is allocated by oneDNN to store computation results - std::shared_ptr AcquireDiffWeightsMemory(void) { + std::shared_ptr AcquireDiffWeightsMemory(void) { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( @@ -179,37 +179,36 @@ class MKLDNNHandlerNoCachingT { bwd_desc, engine_, *fwd_pd_); } - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, void* ptr) { - return std::make_shared(md, engine_, ptr); + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md, void* ptr) { + return std::make_shared(md, engine_, ptr); } - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md) { - return std::make_shared(md, engine_); + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md) { + return std::make_shared(md, engine_); } - void AcquireReorder(const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p) { + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p) { auto reorder_p = - 
std::make_shared(*user_memory_p, *target_memory_p); + std::make_shared(*user_memory_p, *target_memory_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); + reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, + {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); } template - std::shared_ptr AcquireMemoryWithReorder( - const mkldnn::memory::desc& user_md, - const mkldnn::memory::desc& target_md, void* ptr, - const std::string& suffix, bool is_persistent = false, + std::shared_ptr AcquireMemoryWithReorder( + const dnnl::memory::desc& user_md, const dnnl::memory::desc& target_md, + void* ptr, bool is_persistent = false, std::function(const F*)> custom_reorder_func = {}) { - std::shared_ptr target_memory_p; + std::shared_ptr target_memory_p; if (custom_reorder_func) { auto reordered_data = custom_reorder_func(reinterpret_cast(ptr)); @@ -217,15 +216,15 @@ class MKLDNNHandlerNoCachingT { } auto user_memory_p = std::make_shared(user_md, engine_, ptr); if (user_md != target_md) { - target_memory_p = std::make_shared(target_md, engine_); + target_memory_p = std::make_shared(target_md, engine_); auto reorder_p = std::make_shared(*user_memory_p, *target_memory_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); + reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, + {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); } else { target_memory_p = user_memory_p; @@ -233,7 +232,7 @@ class MKLDNNHandlerNoCachingT { return target_memory_p; } - mkldnn::engine engine_; + dnnl::engine engine_; platform::Place place_; std::shared_ptr fwd_pd_; std::shared_ptr bwd_pd_; @@ -245,7 +244,7 @@ template class MKLDNNHandlerT { public: - MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx, dnnl::engine engine, platform::Place cpu_place, const std::string& base_key) : dev_ctx_(dev_ctx), engine_(engine), @@ -294,7 +293,7 @@ class MKLDNNHandlerT { return backward_p; } - std::shared_ptr AcquireSrcMemory( + std::shared_ptr AcquireSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive( @@ -302,7 +301,7 @@ class MKLDNNHandlerT { } template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { T_out* ptr = output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr, @@ -310,12 +309,12 @@ class MKLDNNHandlerT { } template - std::shared_ptr AcquireDstMemory(void) { + std::shared_ptr AcquireDstMemory(void) { return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), "@dstt_mem_p"); } template - std::shared_ptr AcquireDstMemory( + std::shared_ptr AcquireDstMemory( const framework::Tensor* output) { const T_out* output_data = output->data(); return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), @@ -323,14 +322,14 @@ class MKLDNNHandlerT { "@bwd-dst_mem_p"); } - std::shared_ptr AcquireDiffDstMemory( + std::shared_ptr AcquireDiffDstMemory( const framework::Tensor* diffdst) { const T* ptr = diffdst->data(); return 
this->AcquireMemoryFromPrimitive( bwd_pd_->diff_dst_desc(), to_void_cast(ptr), "@diff_dst_mem_p"); } - std::shared_ptr AcquireDiffSrcMemory( + std::shared_ptr AcquireDiffSrcMemory( framework::Tensor* diffsrc) { T* ptr = diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); @@ -339,7 +338,7 @@ class MKLDNNHandlerT { } // Buffer of given Tensor is used for oneDNN computation - std::shared_ptr AcquireDiffWeightsMemory( + std::shared_ptr AcquireDiffWeightsMemory( framework::Tensor* diff_weights) { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, @@ -352,7 +351,7 @@ class MKLDNNHandlerT { } // Buffer is allocated by oneDNN to store computation results - std::shared_ptr AcquireDiffWeightsMemory(void) { + std::shared_ptr AcquireDiffWeightsMemory(void) { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( @@ -467,19 +466,19 @@ class MKLDNNHandlerT { } } - std::shared_ptr AcquireMemoryFromPrimitive( + std::shared_ptr AcquireMemoryFromPrimitive( const std::string& suffix) { - return std::static_pointer_cast( + return std::static_pointer_cast( dev_ctx_.GetBlob(key_ + suffix)); } - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, void* ptr, const std::string& suffix) { + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md, void* ptr, const std::string& suffix) { const auto local_key = key_ + suffix; auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_, ptr); + mem_p = std::make_shared(md, engine_, ptr); dev_ctx_.SetBlob(local_key, mem_p); } else { mem_p->set_data_handle(ptr); @@ -487,46 +486,36 @@ class MKLDNNHandlerT { return mem_p; } - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, const std::string& suffix) { + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md, const std::string& suffix) { const auto local_key = key_ + suffix; auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_); + mem_p = std::make_shared(md, engine_); dev_ctx_.SetBlob(local_key, mem_p); } return mem_p; } - void AcquireReorder(const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p, - const std::string& suffix) { - const auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - - if (reorder_p == nullptr) { - reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - } + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p) { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); + reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, + {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); } template - std::shared_ptr AcquireMemoryWithReorder( - const mkldnn::memory::desc& user_md, - const mkldnn::memory::desc& target_md, void* ptr, - const std::string& suffix, bool is_persistent = false, + std::shared_ptr AcquireMemoryWithReorder( + const dnnl::memory::desc& user_md, const dnnl::memory::desc& 
target_md, + void* ptr, const std::string& suffix, bool is_persistent = false, std::function(const F*)> custom_reorder_func = {}, const std::vector& scale_data = {1.0f}, int mask = 0) { const auto target_key = key_ + suffix + "_target"; @@ -546,7 +535,7 @@ class MKLDNNHandlerT { auto user_memory_p = std::make_shared(user_md, engine_, ptr); if (user_md != target_md) { - target_memory_p = std::make_shared(target_md, engine_); + target_memory_p = std::make_shared(target_md, engine_); dnnl::reorder::primitive_desc reorder_pdesc; if (is_int8()) { dnnl::primitive_attr attr; @@ -563,8 +552,8 @@ class MKLDNNHandlerT { auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); + reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, + {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); } else { target_memory_p = user_memory_p; @@ -578,27 +567,28 @@ class MKLDNNHandlerT { std::static_pointer_cast(dev_ctx_.GetBlob(user_key)); user_memory_p->set_data_handle(ptr); - auto reorder_p = std::static_pointer_cast( + // TODO(jczaja): Here we detect if reorder is cached it means it is needed + // need to change this to get rid of keys + auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); if (reorder_p != nullptr) { platform::RecordEvent record_reorder("int_reorder", platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); + reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, + {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); } } return target_memory_p; } - std::shared_ptr AcquireMemory(const std::string& suffix) { + std::shared_ptr AcquireMemory(const std::string& suffix) { const auto local_key = key_ + suffix; - return std::static_pointer_cast( - dev_ctx_.GetBlob(local_key)); + return std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); } const MKLDNNDeviceContext& dev_ctx_; - mkldnn::engine engine_; + dnnl::engine engine_; platform::Place place_; std::string key_common_; std::string key_; @@ -612,9 +602,10 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, - const mkldnn::engine engine, platform::Place cpu_place, + const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z) + float scale_x, float scale_y, float scale_z, + const dnnl::post_ops& post_ops = dnnl::post_ops()) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { PADDLE_ENFORCE_EQ( x->layout(), DataLayout::kMKLDNN, @@ -663,11 +654,12 @@ class BinaryMKLDNNHandler MKLDNNMemoryFormat::any); auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); + attributes.set_post_ops(post_ops); + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, dst_md); } - - std::shared_ptr AcquireSecondSrcMemory( + std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), @@ -712,7 +704,7 @@ class BroadcastDataMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { public: BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, - const mkldnn::engine engine, + const dnnl::engine engine, platform::Place cpu_place, const 
Tensor* out, const Tensor* x, float scale_x, float scale_y, const std::vector& input_dims) @@ -740,7 +732,7 @@ class BroadcastDataMKLDNNHandler } template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { T_out* ptr = output->mutable_data( this->place_, this->fwd_pd_->dst_desc().get_size()); memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); @@ -753,7 +745,7 @@ class ReductionMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { public: ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p, - const float eps, const mkldnn::engine engine, + const float eps, const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, std::vector y_tz, const dnnl::primitive_attr& attr = NULL) @@ -780,18 +772,126 @@ class ReductionMKLDNNHandler } }; +template +class MatMulV2MKLDNNHandler + : public paddle::platform::MKLDNNHandlerNoCachingT { + public: + MatMulV2MKLDNNHandler(const dnnl::engine engine, + paddle::platform::Place cpu_place, + const std::vector& x_org_dims, bool trans_x, + const std::vector& y_org_dims, bool trans_y, + bool is_output_fused, + const std::vector& x_strides_override, + const std::vector& y_strides_override) + : paddle::platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + // M X K * K X N + std::vector x_dims(x_org_dims); + std::vector y_dims(y_org_dims); + + const int MB_idx = x_dims.size() - 3; + const int H_idx = x_dims.size() - 2; + const int W_idx = x_dims.size() - 1; + + if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); + if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); + + const memory::dim M = x_dims[H_idx]; + const memory::dim K = x_dims[W_idx]; + const memory::dim N = y_dims[W_idx]; + + std::vector x_strides(x_dims.size() - 3, 1); + std::vector y_strides(x_dims.size() - 3, 1); + std::vector out_strides(x_dims.size() - 3, 1); + std::vector out_ddims(x_dims.size() - 3, 1); + + x_strides.reserve(x_dims.size()); + y_strides.reserve(x_dims.size()); + out_strides.reserve(x_dims.size()); + + if (!x_strides_override.empty()) { + x_strides = x_strides_override; + } else { + if (!trans_x) { + x_strides.insert(x_strides.end(), {M * K, K, 1}); + } else { + x_strides.insert(x_strides.end(), {M * K, 1, M}); + } + } + + if (!y_strides_override.empty()) { + y_strides = y_strides_override; + } else { + if (!trans_y) { + y_strides.insert(y_strides.end(), {N * K, N, 1}); + } else { + y_strides.insert(y_strides.end(), {N * K, 1, K}); + } + } + + out_strides.insert(out_strides.end(), {M * N, N, 1}); + out_ddims.insert(out_ddims.end(), + {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); + + for (int i = x_dims.size() - 4; i >= 0; --i) { + out_ddims[i] = std::max(x_dims[i], y_dims[i]); + if (x_strides_override.empty()) { + x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; + } + if (y_strides_override.empty()) { + y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; + } + out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; + } + + if (is_output_fused) { + out_strides = FakeTransposeStrides(out_ddims); + } + + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); + + this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); + } + + std::vector FakeTransposeStrides( + const std::vector& matmul_out_dims) const { + // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and + // transpose axis 
are: {0, 2, 1, 3} + std::vector transpose_axis = {0, 2, 1, 3}; + std::vector fake_strides(transpose_axis.size()); + int ndims = static_cast(transpose_axis.size()); + + int total_stride = 1; + + for (int i = ndims - 1; i >= 0; --i) { + fake_strides[transpose_axis[i]] = total_stride; + total_stride *= matmul_out_dims[transpose_axis[i]]; + } + + return fake_strides; + } + + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data)); + } +}; + template class ActivationMKLDNNHandler - : public MKLDNNHandlerNoCachingT { + : public MKLDNNHandlerNoCachingT { public: - ActivationMKLDNNHandler(mkldnn::algorithm algorithm, + ActivationMKLDNNHandler(dnnl::algorithm algorithm, const framework::ExecutionContext& ctx, - const mkldnn::engine engine, Place cpu_place, + const dnnl::engine engine, Place cpu_place, const framework::Tensor* in_x) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; @@ -816,7 +916,7 @@ class ActivationMKLDNNHandler : ctx.Attr("max"); } else { // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { + if (algorithm == dnnl::algorithm::eltwise_swish) { std::swap(alpha, beta); } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { alpha = ctx.Attr("threshold"); @@ -832,24 +932,24 @@ class ActivationMKLDNNHandler auto src_tz = framework::vectorize(in_x->dims()); auto src_fmt = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); auto md = - mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); + dnnl::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, algorithm, md, alpha, beta); } - ActivationMKLDNNHandler(mkldnn::algorithm algorithm, + ActivationMKLDNNHandler(dnnl::algorithm algorithm, const framework::ExecutionContext& ctx, - const mkldnn::engine engine, Place cpu_place, + const dnnl::engine engine, Place cpu_place, const framework::Tensor* in_x, const Tensor* out_grad) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; float beta = ctx.HasAttr("beta") ? 
ctx.Attr("beta") : 0; // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { + if (algorithm == dnnl::algorithm::eltwise_swish) { std::swap(alpha, beta); } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { alpha = ctx.Attr("threshold"); @@ -875,13 +975,13 @@ class ActivationMKLDNNHandler auto src_md = platform::MKLDNNMemDesc( dims, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, algorithm, src_md, alpha, beta); this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, alpha, beta); } - std::shared_ptr AcquireBackwardSrcMemory( + std::shared_ptr AcquireBackwardSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), @@ -893,7 +993,7 @@ class ReorderMKLDNNHandler { public: ReorderMKLDNNHandler(std::vector& dims, // NOLINT framework::proto::VarType::Type vtype, - mkldnn::memory::data_type dtype, mkldnn::engine engine) + dnnl::memory::data_type dtype, dnnl::engine engine) : dims_(dims), vtype_(vtype), vtype_dst_(vtype), @@ -903,10 +1003,9 @@ class ReorderMKLDNNHandler { ReorderMKLDNNHandler(std::vector& dims, // NOLINT framework::proto::VarType::Type vtype, - mkldnn::memory::data_type dtype, + dnnl::memory::data_type dtype, framework::proto::VarType::Type vtype_dst, - mkldnn::memory::data_type dtype_dst, - mkldnn::engine engine) + dnnl::memory::data_type dtype_dst, dnnl::engine engine) : dims_(dims), vtype_(vtype), vtype_dst_(vtype_dst), @@ -914,56 +1013,56 @@ class ReorderMKLDNNHandler { dtype_dst_(dtype_dst), engine_(engine) {} - std::shared_ptr AcquireSrcMemory( - const MKLDNNMemoryFormat& fmt, void* ptr) { - auto md = mkldnn::memory::desc(dims_, dtype_, fmt); - return std::make_shared(md, engine_, ptr); + std::shared_ptr AcquireSrcMemory(const MKLDNNMemoryFormat& fmt, + void* ptr) { + auto md = dnnl::memory::desc(dims_, dtype_, fmt); + return std::make_shared(md, engine_, ptr); } - std::shared_ptr AcquireSubmemory( + std::shared_ptr AcquireSubmemory( const std::vector& dims, const std::vector& offset, - const std::shared_ptr& mem_p) { + const std::shared_ptr& mem_p) { auto sub_md = mem_p->get_desc().submemory_desc(dims, {offset}); - auto sub_mem_p = std::make_shared(sub_md, engine_, - mem_p->get_data_handle()); + auto sub_mem_p = std::make_shared(sub_md, engine_, + mem_p->get_data_handle()); return sub_mem_p; } - std::shared_ptr AcquireDstMemory( - framework::Tensor* output, const MKLDNNMemoryFormat& fmt, - platform::Place place) { + std::shared_ptr AcquireDstMemory(framework::Tensor* output, + const MKLDNNMemoryFormat& fmt, + platform::Place place) { auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); auto dst_data = output->mutable_data(place, vtype_dst_, dst_md.get_size()); - return std::make_shared(dst_md, engine_, dst_data); + return std::make_shared(dst_md, engine_, dst_data); } - std::shared_ptr AcquireDstMemory( + std::shared_ptr AcquireDstMemory( framework::Tensor* output, const std::vector& dims, const MKLDNNMemoryFormat& fmt, platform::Place place) { auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt); auto dst_data = output->mutable_data(place, vtype_dst_, dst_md.get_size()); - return std::make_shared(dst_md, engine_, dst_data); + return std::make_shared(dst_md, engine_, dst_data); } - std::shared_ptr AcquireReorder( - std::shared_ptr 
dst_memory_p,
-      std::shared_ptr<mkldnn::memory> src_memory_p) {
-    return std::make_shared<mkldnn::reorder>(*(src_memory_p), *(dst_memory_p));
+  std::shared_ptr<dnnl::reorder> AcquireReorder(
+      std::shared_ptr<dnnl::memory> dst_memory_p,
+      std::shared_ptr<dnnl::memory> src_memory_p) {
+    return std::make_shared<dnnl::reorder>(*(src_memory_p), *(dst_memory_p));
   }
 
  private:
   std::vector<int64_t> dims_;
   framework::proto::VarType::Type vtype_, vtype_dst_;
-  mkldnn::memory::data_type dtype_, dtype_dst_;
-  mkldnn::engine engine_;
+  dnnl::memory::data_type dtype_, dtype_dst_;
+  dnnl::engine engine_;
 };
 
 template <typename T>
 static void SetDstMemoryQuantized(
     const framework::ExecutionContext& ctx, framework::Tensor* output,
-    std::vector<int64_t> dst_tz, const mkldnn::engine& engine,
-    std::shared_ptr<mkldnn::memory::desc>& dst_md,  // NOLINT
-    std::shared_ptr<mkldnn::memory>& dst_memory,    // NOLINT
+    std::vector<int64_t> dst_tz, const dnnl::engine& engine,
+    std::shared_ptr<dnnl::memory::desc>& dst_md,  // NOLINT
+    std::shared_ptr<dnnl::memory>& dst_memory,    // NOLINT
     MKLDNNMemoryFormat output_format) {
   T* output_data = output->mutable_data<T>(ctx.GetPlace());
   const size_t dst_dims = dst_tz.size();
@@ -979,9 +1078,9 @@ static void SetDstMemoryQuantized(
       {dst_tz}, paddle::framework::ToMKLDNNDataType(
                     framework::DataTypeTrait<T>::DataType()),
       dst_fmt);
-  dst_md.reset(new mkldnn::memory::desc(tmp_dst_md));
+  dst_md.reset(new dnnl::memory::desc(tmp_dst_md));
   dst_memory.reset(
-      new mkldnn::memory(*dst_md, engine, to_void_cast<T>(output_data)));
+      new dnnl::memory(*dst_md, engine, to_void_cast<T>(output_data)));
 }
 
 }  // namespace platform
diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py
index 3a4dc8ed9afcc42501c6848a6a3f2b18260903be..bbabbaa00730991aa77e12167b6b4fd5ba91c779 100644
--- a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py
@@ -83,7 +83,7 @@ class AutoMixedPrecisionListsBF16(object):
 bf16_initializer_list = {'fill_constant', 'uniform_random'}
 
 # always bf16
-bf16_list = {'elementwise_add', }
+bf16_list = {'elementwise_add', 'mul'}
 
 # depends on the prev_op type
 gray_list = {
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index 8db8b793597bd57e9343af9e37f3cf5150d96225..4324e582fc99135bb79804ac4a22b4d41c970c13 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -24,10 +24,28 @@ import unittest
 import math
 import sys
 import os
+import struct
 
 paddle.enable_static()
 
 
+def convert_uint16_to_float(in_list):
+    in_list = numpy.asarray(in_list)
+    out = numpy.vectorize(
+        lambda x: struct.unpack('<f', struct.pack('<I', x << 16))[0],
+        otypes=[numpy.float32])(in_list.flat)
+    return numpy.reshape(out, in_list.shape)
+
+
+def convert_float_to_uint16(in_list):
+    in_list = numpy.asarray(in_list)
+    out = numpy.vectorize(
+        lambda x: struct.unpack('<I', struct.pack('<f', x))[0] >> 16,
+        otypes=[numpy.uint16])(in_list.flat)
+    out = numpy.reshape(out, in_list.shape).view(numpy.uint16)
+    return out
+
+
 def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16):
     x = fluid.layers.data(name='x', shape=[13], dtype='float32')
     y = fluid.layers.data(name='y', shape=[1], dtype='float32')
@@ -84,6 +102,8 @@ def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16):
             avg_loss_value, = exe.run(main_program,
                                       feed=feeder.feed(data),
                                       fetch_list=[avg_cost])
+            if avg_loss_value.dtype == numpy.uint16:
+                avg_loss_value = convert_uint16_to_float(avg_loss_value)
             if avg_loss_value[0] < 10.0:
                 if save_dirname is not None:
                     paddle.static.save_inference_model(
@@ -147,6 +167,10 @@ def infer(use_cuda, save_dirname=None, use_bf16=False):
         test_data = next(test_reader())
         test_feat = numpy.array(
             [data[0] for data in test_data]).astype("float32")
+
+        if use_bf16:
+            test_feat = convert_float_to_uint16(test_feat)
+
         test_label = numpy.array(
            [data[1] for data in test_data]).astype("float32")
@@ -154,6 +178,8 @@
         results = exe.run(inference_program,
                           feed={feed_target_names[0]: numpy.array(test_feat)},
                           fetch_list=fetch_targets)
+        if results[0].dtype == numpy.uint16:
+            results[0] = convert_uint16_to_float(results[0])
         print("infer shape: ", results[0].shape)
         print("infer results: ", results[0])
         print("ground truth: ", test_label)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py
index 0c91868d30245b4dafbaeaf8fba576506758eb30..9265d5f7edfbbd120673ce42a7eb571a6afc7af3 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci
 '''
@@ -159,4 +160,5 @@ class TestMKLDNNMulOpS8U8WithFlatten(TestMKLDNNMulOpS8S8WithFlatten):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0581d791209d327df09d5eb2f8e11f3972c4e2f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import print_function + +import unittest +import numpy as np +from numpy.matrixlib import defmatrix +import paddle +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16, OpTestTool + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestMulOneDNNOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.attrs = {'use_mkldnn': True} + self.init_shapes_and_attrs() + + self.x_fp32 = np.random.random(self.x_shape).astype(np.float32) + self.y_fp32 = np.random.random(self.y_shape).astype(np.float32) + + self.x = self.x_fp32 + self.y = self.y_fp32 + + self.init_inputs_dtype() + + self.inputs = {'X': self.x, 'Y': self.y} + + output = np.dot( + np.reshape(self.x_fp32, self.np_x_shape), + np.reshape(self.y_fp32, self.np_y_shape)) + self.outputs = {'Out': np.reshape(output, self.out_shape)} + + def init_shapes_and_attrs(self): + self.x_shape = (20, 5) + self.y_shape = (5, 21) + + self.np_x_shape = (20, 5) + self.np_y_shape = (5, 21) + + self.out_shape = (20, 21) + + def init_inputs_dtype(self): + pass + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + self.check_grad_with_place(core.CPUPlace(), ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place(core.CPUPlace(), ['Y'], 'Out', set('X')) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place(core.CPUPlace(), ['X'], 'Out', set('Y')) + + +class TestMulXNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (6, 7, 5) + self.y_shape = (5, 21) + + self.np_x_shape = (42, 5) + self.np_y_shape = (5, 21) + + self.out_shape = (6, 7, 21) + + self.attrs["x_num_col_dims"] = 2 + + +class TestMulYNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (20, 6) + self.y_shape = (2, 3, 21) + + self.np_x_shape = (20, 6) + self.np_y_shape = (6, 21) + + self.out_shape = (20, 21) + + self.attrs["y_num_col_dims"] = 2 + + +class TestMulYAndXNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (10, 5, 6) + self.y_shape = (2, 3, 21) + + self.np_x_shape = (50, 6) + self.np_y_shape = (6, 21) + + self.out_shape = (10, 5, 21) + + self.attrs["x_num_col_dims"] = 2 + self.attrs["y_num_col_dims"] = 2 + + +class TestMulBF16OneDNNOp(TestMulOneDNNOp): + def init_inputs_dtype(self): + self.x = convert_float_to_uint16(self.x) + self.y = convert_float_to_uint16(self.y) + + def calculate_grads(self): + x_np = np.reshape(self.x_fp32, self.np_x_shape) + y_np = np.reshape(self.y_fp32, self.np_y_shape) + + self.dout = self.outputs['Out'] + self.dout_np = np.reshape(self.dout, (x_np.shape[0], y_np.shape[1])) + + y_np_trans = np.transpose(y_np, (1, 0)) + x_np_trans = np.transpose(x_np, (1, 0)) + + self.dx = np.matmul(self.dout_np, y_np_trans) + self.dy = np.matmul(x_np_trans, self.dout_np) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['X', 'Y'], + 'Out', + user_defined_grads=[self.dx, self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def test_check_grad_ingore_x(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['Y'], + 'Out', + set('X'), + user_defined_grads=[self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def test_check_grad_ingore_y(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['X'], + 'Out', + set('Y'), + 
user_defined_grads=[self.dx], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main()
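
Note on the gradient math exercised by TestMulBF16OneDNNOp above: MulGradMKLDNNKernel flattens X and Y to 2-D matrices according to x_num_col_dims / y_num_col_dims and then runs two matmuls, dX = dOut * Y^T and dY = X^T * dOut, which is also what calculate_grads computes as the reference. The sketch below is illustrative only and not part of the patch; the helper name mul_grad_reference is made up here, and it assumes the same flatten-to-2D convention as framework::ReshapeToMatrix.

```python
import numpy as np


def mul_grad_reference(x, y, dout, x_num_col_dims=1, y_num_col_dims=1):
    """Reference gradients for mul: dX = dOut * Y^T, dY = X^T * dOut."""
    # Flatten to the 2-D matrices the oneDNN kernel actually multiplies.
    x2d = x.reshape(int(np.prod(x.shape[:x_num_col_dims])), -1)
    y2d = y.reshape(int(np.prod(y.shape[:y_num_col_dims])), -1)
    dout2d = dout.reshape(x2d.shape[0], y2d.shape[1])

    dx = np.matmul(dout2d, y2d.T).reshape(x.shape)
    dy = np.matmul(x2d.T, dout2d).reshape(y.shape)
    return dx, dy


# Shapes mirror TestMulXNumColDims2OneDNNOp: X is (6, 7, 5), Y is (5, 21).
x = np.random.random((6, 7, 5)).astype(np.float32)
y = np.random.random((5, 21)).astype(np.float32)
dout = np.random.random((6, 7, 21)).astype(np.float32)
dx, dy = mul_grad_reference(x, y, dout, x_num_col_dims=2)
assert dx.shape == x.shape and dy.shape == y.shape
```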