diff --git a/build_test.sh b/build_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4c5552fc6499fcb34c8836f7151dd385fb135305
--- /dev/null
+++ b/build_test.sh
@@ -0,0 +1,3 @@
+bazel --output_user_root=../output run \
+--config=v2 --copt=-O2 --copt=-march=armv8-a --config=noaws --distdir=../proxy \
+//tensorflow/core/kernels:embedding_fused_ops_test --jobs=64  # builds and runs embedding_fused_ops_test; ../output and ../proxy are relative to the current directory
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index f74da7e88601dff35c8fc9b90150f43c8c15c606..7f78d63fab6a8deed67103ade5f81d1de7593066 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4133,7 +4133,7 @@ tf_kernel_library(
     srcs = if_enable_annc([
         "embedding_fused_sparse_segment_reduce_nonzero.cc",
     ]),
-    deps = MATH_DEPS,
+    deps = MATH_DEPS + ["@com_google_absl//absl/container:flat_hash_map"],
 )
 
 tf_kernel_library(
@@ -4162,6 +4162,7 @@ tf_cc_test(
     name = "embedding_fused_ops_test",
     size = "small",
     srcs = if_enable_annc([
+        "embedding_fused_action_id_gather_test.cc",
         "embedding_fused_sparse_dynamic_stitch_test.cc",
         "embedding_fused_sparse_segment_reduce_test.cc",
         "embedding_fused_sparse_segment_reduce_nonzero_test.cc",
diff --git a/tensorflow/core/kernels/embedding_fused_action_id_gather_test.cc b/tensorflow/core/kernels/embedding_fused_action_id_gather_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1fc19aa8ecdd85d256fab048aa0fdc9fbded54fa
--- /dev/null
+++ b/tensorflow/core/kernels/embedding_fused_action_id_gather_test.cc
@@ -0,0 +1,284 @@
+/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+// Test fixture that builds and runs a single KPFusedEmbeddingActionIdGather op.
+class KPFusedEmbeddingActionIdGatherTest : public OpsTestBase {
+ protected:
+  // Builds the NodeDef for the given index dtypes and initializes the kernel.
+  void MakeOp(DataType indices1_type, DataType indices2_type) {
+    NodeDefBuilder builder("fused_embedding_action_id_gather", "KPFusedEmbeddingActionIdGather");
+    builder.Input(FakeInput(indices1_type))  // indices1
+        .Input(FakeInput(DT_FLOAT))          // params
+        .Input(FakeInput(indices2_type))     // indices2
+        .Input(FakeInput(DT_INT32))          // pack_dim
+        .Input(FakeInput(DT_INT32));         // pack
+    TF_ASSERT_OK(builder.Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+  // Re-creates the op, feeds the five inputs in order and runs the kernel.
+  template <typename Tindices1, typename Tindices2>
+  Status FeedAndRun(const std::vector<Tindices1>& indices1_data,
+                    const TensorShape& indices1_shape,
+                    const std::vector<float>& params_data,
+                    const TensorShape& params_shape,
+                    const std::vector<Tindices2>& indices2_data,
+                    const TensorShape& indices2_shape,
+                    int pack_dim_value, int pack_value) {
+    input_types_.clear();  // reset state so one test can call this repeatedly
+    inputs_.clear();
+    MakeOp(DataTypeToEnum<Tindices1>::v(), DataTypeToEnum<Tindices2>::v());
+    AddInputFromArray<Tindices1>(indices1_shape, indices1_data);
+    AddInputFromArray<float>(params_shape, params_data);
+    AddInputFromArray<Tindices2>(indices2_shape, indices2_data);
+    AddInputFromArray<int32>(TensorShape({}), {pack_dim_value});  // scalar
+    AddInputFromArray<int32>(TensorShape({}), {pack_value});      // scalar
+    return RunOpKernel();
+  }
+};
+
+// Valid-input case: int64 indices1 and int32 indices2 (both shaped {2, 1})
+// with a {3, 2} float params tensor; the op must succeed.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, NormalCase) {
+  const std::vector<int64> first_ids = {0, 2};
+  const std::vector<float> params_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const std::vector<int32> second_ids = {1, 0};
+  const int pack_dim = 2;
+  const int pack = 1;
+
+  // Shapes are passed inline, in the same order as the data they describe.
+  const Status run_status = FeedAndRun<int64, int32>(
+      first_ids, TensorShape({2, 1}),
+      params_values, TensorShape({3, 2}),
+      second_ids, TensorShape({2, 1}),
+      pack_dim, pack);
+  TF_ASSERT_OK(run_status);
+
+  // Output 0 is compared against a {2, 3} float tensor with tolerance 1e-5.
+  Tensor want(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&want, {5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f});
+  test::ExpectTensorNear<float>(want, *GetOutput(0), 1e-5);
+}
+
+// Runs the same inputs through four int32/int64 combinations of the index
+// dtypes; every combination is expected to produce the same {2, 3} output.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, DifferentIndexTypes) {
+  const std::vector<float> params = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const TensorShape params_shape({3, 2});
+  const TensorShape index_shape({2, 1});
+  const Tensor expected =
+      test::AsTensor<float>({5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f}, {2, 3});
+
+  // indices1: int64, indices2: int32
+  {
+    const std::vector<int64> ids1 = {0, 2};
+    const std::vector<int32> ids2 = {1, 0};
+    TF_ASSERT_OK((FeedAndRun<int64, int32>(ids1, index_shape, params,
+                                           params_shape, ids2, index_shape,
+                                           2, 1)));
+    test::ExpectTensorNear<float>(*GetOutput(0), expected, 1e-5);
+  }
+
+  // indices1: int32, indices2: int32
+  {
+    const std::vector<int32> ids1 = {0, 2};
+    const std::vector<int32> ids2 = {1, 0};
+    TF_ASSERT_OK((FeedAndRun<int32, int32>(ids1, index_shape, params,
+                                           params_shape, ids2, index_shape,
+                                           2, 1)));
+    test::ExpectTensorNear<float>(*GetOutput(0), expected, 1e-5);
+  }
+
+  // indices1: int64, indices2: int64
+  {
+    const std::vector<int64> ids1 = {0, 2};
+    const std::vector<int64> ids2 = {1, 0};
+    TF_ASSERT_OK((FeedAndRun<int64, int64>(ids1, index_shape, params,
+                                           params_shape, ids2, index_shape,
+                                           2, 1)));
+    test::ExpectTensorNear<float>(*GetOutput(0), expected, 1e-5);
+  }
+
+  // indices1: int32, indices2: int64
+  {
+    const std::vector<int32> ids1 = {0, 2};
+    const std::vector<int64> ids2 = {1, 0};
+    TF_ASSERT_OK((FeedAndRun<int32, int64>(ids1, index_shape, params,
+                                           params_shape, ids2, index_shape,
+                                           2, 1)));
+    test::ExpectTensorNear<float>(*GetOutput(0), expected, 1e-5);
+  }
+}
+
+// Feeds `indices1` as a rank-1 tensor and expects the kernel to fail with the
+// "indices1 dims must = 2" error.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidIndices1Dims) {
+  const std::vector<int64> indices1_data = {0, 2};
+  const std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const std::vector<int32> indices2_data = {1, 0};
+
+  // FeedAndRun builds the op and feeds the inputs; only the indices1 shape
+  // ({2} instead of {2, 1}) differs from the valid NormalCase setup.
+  const Status s = FeedAndRun<int64, int32>(
+      indices1_data, TensorShape({2}),
+      params_data, TensorShape({3, 2}),
+      indices2_data, TensorShape({2, 1}),
+      /*pack_dim_value=*/2, /*pack_value=*/1);
+
+  // The failure has to surface through the status returned by the kernel run.
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "indices1 dims must = 2")) << s;
+}
+
+// Feeds `indices2` as a rank-1 tensor and expects the kernel to fail with the
+// "indices2 dims must = 2" error.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidIndices2Dims) {
+  const std::vector<int64> indices1_data = {0, 2};
+  const std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const std::vector<int32> indices2_data = {1, 0};
+
+  // FeedAndRun builds the op and feeds the inputs; only the indices2 shape
+  // ({2} instead of {2, 1}) differs from the valid NormalCase setup.
+  const Status s = FeedAndRun<int64, int32>(
+      indices1_data, TensorShape({2, 1}),
+      params_data, TensorShape({3, 2}),
+      indices2_data, TensorShape({2}),
+      /*pack_dim_value=*/2, /*pack_value=*/1);
+
+  // The failure has to surface through the status returned by the kernel run.
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "indices2 dims must = 2")) << s;
+}
+
+// Feeds `params` as a rank-1 tensor and expects the kernel to fail with the
+// "params dims must = 2" error.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidParamsDims) {
+  const std::vector<int64> indices1_data = {0, 2};
+  const std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f};
+  const std::vector<int32> indices2_data = {1, 0};
+
+  // FeedAndRun builds the op and feeds the inputs; params is the invalid one
+  // here: four values with shape {4} rather than a rank-2 tensor.
+  const Status s = FeedAndRun<int64, int32>(
+      indices1_data, TensorShape({2, 1}),
+      params_data, TensorShape({4}),
+      indices2_data, TensorShape({2, 1}),
+      /*pack_dim_value=*/2, /*pack_value=*/1);
+
+  // The failure has to surface through the status returned by the kernel run.
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "params dims must = 2")) << s;
+}
+
+// Feeds pack_dim with shape {1} and expects the "pack_dim is scalar" error.
+// Inputs are fed by hand because FeedAndRun always gives pack_dim shape {}.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidPackDimDims) {
+  const std::vector<int64> ids1 = {0, 2};
+  const std::vector<float> params_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const std::vector<int32> ids2 = {1, 0};
+
+  MakeOp(DT_INT64, DT_INT32);
+  AddInputFromArray<int64>(TensorShape({2, 1}), ids1);
+  AddInputFromArray<float>(TensorShape({3, 2}), params_values);
+  AddInputFromArray<int32>(TensorShape({2, 1}), ids2);
+  AddInputFromArray<int32>(TensorShape({1}), {2});  // pack_dim: rank 1
+  AddInputFromArray<int32>(TensorShape({}), {1});   // pack: scalar
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_TRUE(absl::StrContains(status.ToString(), "pack_dim is scalar"))
+      << status;
+}
+
+// Feeds pack with shape {1} and expects the "pack const is scalar" error.
+// Inputs are fed by hand because FeedAndRun always gives pack shape {}.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidPackDims) {
+  const std::vector<int64> ids1 = {0, 2};
+  const std::vector<float> params_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const std::vector<int32> ids2 = {1, 0};
+
+  MakeOp(DT_INT64, DT_INT32);
+  AddInputFromArray<int64>(TensorShape({2, 1}), ids1);
+  AddInputFromArray<float>(TensorShape({3, 2}), params_values);
+  AddInputFromArray<int32>(TensorShape({2, 1}), ids2);
+  AddInputFromArray<int32>(TensorShape({}), {2});   // pack_dim: scalar
+  AddInputFromArray<int32>(TensorShape({1}), {1});  // pack: rank 1
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_TRUE(absl::StrContains(status.ToString(), "pack const is scalar"))
+      << status;
+}
+
+// Feeds a pack_dim value of 0 and expects the kernel to fail with the
+// "pack_size must > 0" error.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidPackSize) {
+  const std::vector<int64> indices1_data = {0, 2};
+  const std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const std::vector<int32> indices2_data = {1, 0};
+
+  // FeedAndRun builds the op and feeds the inputs; all shapes are valid and
+  // only the pack_dim value (0 instead of 2) differs from NormalCase.
+  const Status s = FeedAndRun<int64, int32>(
+      indices1_data, TensorShape({2, 1}),
+      params_data, TensorShape({3, 2}),
+      indices2_data, TensorShape({2, 1}),
+      /*pack_dim_value=*/0, /*pack_value=*/1);
+
+  // The failure has to surface through the status returned by the kernel run.
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "pack_size must > 0")) << s;
+}
+
+// Feeds an indices1 value of 5 against a {3, 2} params tensor and expects the
+// "GatherV2 axis=0: index out of range" error.
+TEST_F(KPFusedEmbeddingActionIdGatherTest, IndexOutOfRange) {
+  const std::vector<int64> indices1_data = {0, 5};
+  const std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const std::vector<int32> indices2_data = {1, 0};
+
+  // FeedAndRun builds the op and feeds the inputs; all shapes are valid.
+  // NOTE(review): presumably 5 exceeds the 3 rows of params - confirm in kernel.
+  const Status s = FeedAndRun<int64, int32>(
+      indices1_data, TensorShape({2, 1}),
+      params_data, TensorShape({3, 2}),
+      indices2_data, TensorShape({2, 1}),
+      /*pack_dim_value=*/2, /*pack_value=*/1);
+
+  // The failure has to surface through the status returned by the kernel run.
+  EXPECT_FALSE(s.ok());
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "GatherV2 axis=0: index out of range")) << s;
+}
+
+}  // namespace tensorflow
\ No newline at end of file
diff --git a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
index cd1d341b6edb51c5193c936d94b7b464ac008685..155e153a3bb0fe831285942a0afc08f88aaa9d62 100644
--- a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
+++ b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/util/work_sharder.h"
+#include "absl/container/flat_hash_map.h"
 
 using namespace tensorflow;
 
@@ -59,9 +60,11 @@ public:
     auto slice_input_mat = slice_input.matrix<int64>();
 
  	// Calculate max segment_id
+    std::vector<int64> segment_ids(num_indices);
     int64 max_seg_id = 0;
     for (int64 i = 0; i < num_indices; ++i) {
       int64 seg_id = slice_input_mat(i, col);
+      segment_ids[i] = seg_id;
       if (seg_id > max_seg_id) {
         max_seg_id = seg_id;
       }
@@ -74,45 +77,53 @@ public:
         context, context->allocate_output(0, TensorShape({1}), &output_shape));
     output_shape->flat<int32>()(0) = static_cast<int32>(batch_size);
 
-    std::vector<std::pair<int64, float>> results(batch_size);
+    std::vector<std::pair<int64, float>> results;
     int64 num_nonzero = 0;
-    Tensor temp(DT_FLOAT, TensorShape({batch_size}));
-    temp.flat<float>().setZero();
-    auto temp_vec = temp.flat<float>();
+    absl::flat_hash_map<int64, float> segment_sums;
+    absl::flat_hash_map<int64, int32> segment_counts;
+    std::vector<int64> segment_order;
 
     if (is_mean_) {
-      Tensor counts(DT_INT32, TensorShape({batch_size}));
-      counts.flat<int32>().setZero();
-      auto counts_vec = counts.flat<int32>();
-
       for (int64 i = 0; i < num_indices; ++i) {
-        const int64 seg_id = slice_input_mat(i, col);
+        const int64 seg_id = segment_ids[i];
         const Tidx data_row = indices_vec(i);
-        counts_vec(seg_id) += 1;
-        temp_vec(seg_id) += input_data(data_row);
+        
+        if (segment_sums.find(seg_id) == segment_sums.end()) {
+          segment_order.push_back(seg_id);
+        }
+        
+        segment_sums[seg_id] += input_data(data_row);
+        segment_counts[seg_id] += 1;
       }
-  
-      for (int64 seg = 0; seg < batch_size; ++seg) {
-        const int32_t count = counts_vec(seg);
+
+      for (int64 seg_id : segment_order) {
+        const int32_t count = segment_counts[seg_id];
         if (count > 0) {
           const float inv_count = 1.0f / static_cast<float>(count);
-          float value = temp_vec(seg);
+          float value = segment_sums[seg_id];
           if (value != 0) {
-            results[num_nonzero++] = {seg, value * inv_count};
+            results.push_back({seg_id, value * inv_count});
+            num_nonzero++;
           }
         }
       }
     } else {
       for (int64 i = 0; i < num_indices; ++i) {
-        const int64 seg_id = slice_input_mat(i, col);
+        const int64 seg_id = segment_ids[i];
         const Tidx data_row = indices_vec(i);
-        temp_vec(seg_id) += input_data(data_row);
+        
+        if (segment_sums.find(seg_id) == segment_sums.end()) {
+          segment_order.push_back(seg_id);
+        }
+        
+        segment_sums[seg_id] += input_data(data_row);
       }
-  
-      for (int64 seg = 0; seg < batch_size; ++seg) {
-        float value = temp_vec(seg);
+
+      for (int64 seg_id : segment_order) {
+        float value = segment_sums[seg_id];
         if (value != 0) {
-          results[num_nonzero++] = {seg, value};
+          results.push_back({seg_id, value});
+          num_nonzero++;
         }
       }
     }