diff --git a/build_test.sh b/build_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4c5552fc6499fcb34c8836f7151dd385fb135305
--- /dev/null
+++ b/build_test.sh
@@ -0,0 +1,3 @@
+bazel --output_user_root=../output run \
+--config=v2 --copt=-O2 --copt=-march=armv8-a --config=noaws --distdir=../proxy \
+//tensorflow/core/kernels:embedding_fused_ops_test --jobs=64
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index f74da7e88601dff35c8fc9b90150f43c8c15c606..7f78d63fab6a8deed67103ade5f81d1de7593066 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4133,7 +4133,7 @@ tf_kernel_library(
     srcs = if_enable_annc([
         "embedding_fused_sparse_segment_reduce_nonzero.cc",
     ]),
-    deps = MATH_DEPS,
+    deps = MATH_DEPS + ["@com_google_absl//absl/container:flat_hash_map"],
 )
 
 tf_kernel_library(
@@ -4162,6 +4162,7 @@ tf_cc_test(
     name = "embedding_fused_ops_test",
     size = "small",
     srcs = if_enable_annc([
+        "embedding_fused_action_id_gather_test.cc",
         "embedding_fused_sparse_dynamic_stitch_test.cc",
         "embedding_fused_sparse_segment_reduce_test.cc",
         "embedding_fused_sparse_segment_reduce_nonzero_test.cc",
diff --git a/tensorflow/core/kernels/embedding_fused_action_id_gather_test.cc b/tensorflow/core/kernels/embedding_fused_action_id_gather_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1fc19aa8ecdd85d256fab048aa0fdc9fbded54fa
--- /dev/null
+++ b/tensorflow/core/kernels/embedding_fused_action_id_gather_test.cc
@@ -0,0 +1,298 @@
+/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/strings/match.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+// Test fixture for the KPFusedEmbeddingActionIdGather kernel. Every test feeds
+// five inputs in this order: indices1, params, indices2, pack_dim and pack.
+class KPFusedEmbeddingActionIdGatherTest : public OpsTestBase {
+ protected:
+  // Builds and initialises the op node. `indices1_type` and `indices2_type`
+  // pick the index dtypes; params is DT_FLOAT, pack_dim and pack are DT_INT32.
+  void MakeOp(DataType indices1_type, DataType indices2_type) {
+    TF_ASSERT_OK(NodeDefBuilder("fused_embedding_action_id_gather", "KPFusedEmbeddingActionIdGather")
+                     .Input(FakeInput(indices1_type))  // indices1
+                     .Input(FakeInput(DT_FLOAT))       // params
+                     .Input(FakeInput(indices2_type))  // indices2
+                     .Input(FakeInput(DT_INT32))       // pack_dim
+                     .Input(FakeInput(DT_INT32))       // pack
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+
+  // Rebuilds the op for index types T1/T2, feeds all five inputs and runs the
+  // kernel. Previously fed inputs are cleared first, so one test may call this
+  // several times. Returns the status reported by RunOpKernel().
+  template <typename T1, typename T2>
+  Status FeedAndRun(const std::vector<T1>& indices1_data,
+                    const TensorShape& indices1_shape,
+                    const std::vector<float>& params_data,
+                    const TensorShape& params_shape,
+                    const std::vector<T2>& indices2_data,
+                    const TensorShape& indices2_shape,
+                    int pack_dim_value,
+                    int pack_value) {
+    inputs_.clear();
+    input_types_.clear();
+
+    MakeOp(DataTypeToEnum<T1>::v(), DataTypeToEnum<T2>::v());
+    AddInputFromArray<T1>(indices1_shape, indices1_data);
+    AddInputFromArray<float>(params_shape, params_data);
+    AddInputFromArray<T2>(indices2_shape, indices2_data);
+    AddInputFromArray<int32>(TensorShape({}), {pack_dim_value});
+    AddInputFromArray<int32>(TensorShape({}), {pack_value});
+    return RunOpKernel();
+  }
+};
+
+// Happy path: [2,1] indices1, [3,2] params, [2,1] indices2, pack_dim=2, pack=1
+// must succeed and produce the expected [2,3] float output.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, NormalCase) {
+  std::vector<int64> indices1_data = {0, 2};
+  TensorShape indices1_shape({2, 1});
+
+  std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  TensorShape params_shape({3, 2});
+
+  std::vector<int32> indices2_data = {1, 0};
+  TensorShape indices2_shape({2, 1});
+
+  int pack_dim_value = 2;
+  int pack_value = 1;
+
+  TF_ASSERT_OK((FeedAndRun<int64, int32>(indices1_data, indices1_shape,
+                                         params_data, params_shape,
+                                         indices2_data, indices2_shape,
+                                         pack_dim_value, pack_value)));
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected, {5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+}
+
+// Runs the same inputs through all four int32/int64 index type combinations
+// and expects the same output each time.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, DifferentIndexTypes) {
+  // indices1 is int64, indices2 is int32.
+  {
+    std::vector<int64> indices1 = {0, 2};
+    std::vector<int32> indices2 = {1, 0};
+    TF_ASSERT_OK((FeedAndRun<int64, int32>(indices1, {2, 1},
+                                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {3, 2},
+                                           indices2, {2, 1}, 2, 1)));
+    test::ExpectTensorNear<float>(*GetOutput(0),
+                                  test::AsTensor<float>({5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f}, {2, 3}),
+                                  1e-5);
+  }
+
+  // indices1 is int32, indices2 is int32.
+  {
+    std::vector<int32> indices1 = {0, 2};
+    std::vector<int32> indices2 = {1, 0};
+    TF_ASSERT_OK((FeedAndRun<int32, int32>(indices1, {2, 1},
+                                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {3, 2},
+                                           indices2, {2, 1}, 2, 1)));
+    test::ExpectTensorNear<float>(*GetOutput(0),
+                                  test::AsTensor<float>({5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f}, {2, 3}),
+                                  1e-5);
+  }
+
+  // indices1 is int64, indices2 is int64.
+  {
+    std::vector<int64> indices1 = {0, 2};
+    std::vector<int64> indices2 = {1, 0};
+    TF_ASSERT_OK((FeedAndRun<int64, int64>(indices1, {2, 1},
+                                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {3, 2},
+                                           indices2, {2, 1}, 2, 1)));
+    test::ExpectTensorNear<float>(*GetOutput(0),
+                                  test::AsTensor<float>({5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f}, {2, 3}),
+                                  1e-5);
+  }
+
+  // indices1 is int32, indices2 is int64.
+  {
+    std::vector<int32> indices1 = {0, 2};
+    std::vector<int64> indices2 = {1, 0};
+    TF_ASSERT_OK((FeedAndRun<int32, int64>(indices1, {2, 1},
+                                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {3, 2},
+                                           indices2, {2, 1}, 2, 1)));
+    test::ExpectTensorNear<float>(*GetOutput(0),
+                                  test::AsTensor<float>({5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f}, {2, 3}),
+                                  1e-5);
+  }
+}
+
+// A rank-1 indices1 must be rejected.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidIndices1Dims) {
+  MakeOp(DT_INT64, DT_INT32);
+
+  std::vector<int64> indices1_data = {0, 2};
+  AddInputFromArray<int64>(TensorShape({2}), indices1_data);
+
+  std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  AddInputFromArray<float>(TensorShape({3, 2}), params_data);
+
+  std::vector<int32> indices2_data = {1, 0};
+  AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data);
+
+  AddInputFromArray<int32>(TensorShape({}), {2});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+
+  Status s = RunOpKernel();
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "indices1 dims must = 2")) << s;
+}
+
+// A rank-1 indices2 must be rejected.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidIndices2Dims) {
+  MakeOp(DT_INT64, DT_INT32);
+
+  std::vector<int64> indices1_data = {0, 2};
+  AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data);
+
+  std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  AddInputFromArray<float>(TensorShape({3, 2}), params_data);
+
+  std::vector<int32> indices2_data = {1, 0};
+  AddInputFromArray<int32>(TensorShape({2}), indices2_data);
+
+  AddInputFromArray<int32>(TensorShape({}), {2});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+
+  Status s = RunOpKernel();
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "indices2 dims must = 2")) << s;
+}
+
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidParamsDims) {
+  MakeOp(DT_INT64, DT_INT32);
+
+  std::vector<int64> indices1_data = {0, 2};
+  AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data);
+
+  std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f};
+  AddInputFromArray<float>(TensorShape({4}), params_data);  // rank 1; the op is expected to demand rank 2
+
+  std::vector<int32> indices2_data = {1, 0};
+  AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data);
+
+  AddInputFromArray<int32>(TensorShape({}), {2});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+
+  Status s = RunOpKernel();
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "params dims must = 2")) << s;
+}
+
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidPackDimDims) {
+  MakeOp(DT_INT64, DT_INT32);
+
+  std::vector<int64> indices1_data = {0, 2};
+  AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data);
+
+  std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  AddInputFromArray<float>(TensorShape({3, 2}), params_data);
+
+  std::vector<int32> indices2_data = {1, 0};
+  AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data);
+
+  AddInputFromArray<int32>(TensorShape({1}), {2});  // pack_dim fed as rank 1 instead of a scalar
+  AddInputFromArray<int32>(TensorShape({}), {1});
+
+  Status s = RunOpKernel();
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "pack_dim is scalar")) << s;
+}
+
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidPackDims) {
+  MakeOp(DT_INT64, DT_INT32);
+
+  std::vector<int64> indices1_data = {0, 2};
+  AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data);
+
+  std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  AddInputFromArray<float>(TensorShape({3, 2}), params_data);
+
+  std::vector<int32> indices2_data = {1, 0};
+  AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data);
+
+  AddInputFromArray<int32>(TensorShape({}), {2});
+  AddInputFromArray<int32>(TensorShape({1}), {1});  // pack fed as rank 1 instead of a scalar
+
+  Status s = RunOpKernel();
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "pack const is scalar")) << s;
+}
+
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidPackSize) {
+  MakeOp(DT_INT64, DT_INT32);
+
+  std::vector<int64> indices1_data = {0, 2};
+  AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data);
+
+  std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  AddInputFromArray<float>(TensorShape({3, 2}), params_data);
+
+  std::vector<int32> indices2_data = {1, 0};
+  AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data);
+
+  AddInputFromArray<int32>(TensorShape({}), {0});  // pack_dim of 0 must be rejected
+  AddInputFromArray<int32>(TensorShape({}), {1});
+
+  Status s = RunOpKernel();
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "pack_size must > 0")) << s;
+}
+
+TEST_F(KPFusedEmbeddingActionIdGatherTest, IndexOutOfRange) {
+  MakeOp(DT_INT64, DT_INT32);
+
+  std::vector<int64> indices1_data = {0, 5};  // 5 is the only value that differs from the passing cases
+  AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data);
+
+  std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  AddInputFromArray<float>(TensorShape({3, 2}), params_data);
+
+  std::vector<int32> indices2_data = {1, 0};
+  AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data);
+
+  AddInputFromArray<int32>(TensorShape({}), {2});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+
+  Status s = RunOpKernel();
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "GatherV2 axis=0: index out of range")) << s;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
index cd1d341b6edb51c5193c936d94b7b464ac008685..155e153a3bb0fe831285942a0afc08f88aaa9d62 100644
--- a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
+++ b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/util/work_sharder.h" +#include "absl/container/flat_hash_map.h" using namespace tensorflow; @@ -59,9 +60,11 @@ public: auto slice_input_mat = slice_input.matrix(); // Calculate max segment_id + std::vector segment_ids(num_indices); int64 max_seg_id = 0; for (int64 i = 0; i < num_indices; ++i) { int64 seg_id = slice_input_mat(i, col); + segment_ids[i] = seg_id; if (seg_id > max_seg_id) { max_seg_id = seg_id; } @@ -74,45 +77,53 @@ public: context, context->allocate_output(0, TensorShape({1}), &output_shape)); output_shape->flat()(0) = static_cast(batch_size); - std::vector> results(batch_size); + std::vector> results; int64 num_nonzero = 0; - Tensor temp(DT_FLOAT, TensorShape({batch_size})); - temp.flat().setZero(); - auto temp_vec = temp.flat(); + absl::flat_hash_map segment_sums; + absl::flat_hash_map segment_counts; + std::vector segment_order; if (is_mean_) { - Tensor counts(DT_INT32, TensorShape({batch_size})); - counts.flat().setZero(); - auto counts_vec = counts.flat(); - for (int64 i = 0; i < num_indices; ++i) { - const int64 seg_id = slice_input_mat(i, col); + const int64 seg_id = segment_ids[i]; const Tidx data_row = indices_vec(i); - counts_vec(seg_id) += 1; - temp_vec(seg_id) += input_data(data_row); + + if (segment_sums.find(seg_id) == segment_sums.end()) { + segment_order.push_back(seg_id); + } + + segment_sums[seg_id] += input_data(data_row); + segment_counts[seg_id] += 1; } - - for (int64 seg = 0; seg < batch_size; ++seg) { - const int32_t count = counts_vec(seg); + + for (int64 seg_id : segment_order) { + const int32_t count = segment_counts[seg_id]; if (count > 0) { const float inv_count = 1.0f / static_cast(count); - float value = temp_vec(seg); + float value = segment_sums[seg_id]; if (value != 0) { - results[num_nonzero++] = {seg, value * inv_count}; + results.push_back({seg_id, value * inv_count}); + num_nonzero++; } } } } 
else { for (int64 i = 0; i < num_indices; ++i) { - const int64 seg_id = slice_input_mat(i, col); + const int64 seg_id = segment_ids[i]; const Tidx data_row = indices_vec(i); - temp_vec(seg_id) += input_data(data_row); + + if (segment_sums.find(seg_id) == segment_sums.end()) { + segment_order.push_back(seg_id); + } + + segment_sums[seg_id] += input_data(data_row); } - - for (int64 seg = 0; seg < batch_size; ++seg) { - float value = temp_vec(seg); + + for (int64 seg_id : segment_order) { + float value = segment_sums[seg_id]; if (value != 0) { - results[num_nonzero++] = {seg, value}; + results.push_back({seg_id, value}); + num_nonzero++; } } }