From 090fa757734d09b6fa92986250a35fdc7e2636d7 Mon Sep 17 00:00:00 2001
From: zhuxg33 <1074959344@qq.com>
Date: Fri, 26 Sep 2025 13:00:32 +0800
Subject: [PATCH] add UT case for sglang kernel (get_num_kv_splits_triton)

---
 .../v0.4.8/test_get_num_kv_splits_triton.py | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 ascend/test/sglang/v0.4.8/test_get_num_kv_splits_triton.py

diff --git a/ascend/test/sglang/v0.4.8/test_get_num_kv_splits_triton.py b/ascend/test/sglang/v0.4.8/test_get_num_kv_splits_triton.py
new file mode 100644
index 0000000..366b16d
--- /dev/null
+++ b/ascend/test/sglang/v0.4.8/test_get_num_kv_splits_triton.py
@@ -0,0 +1,84 @@
+import sys
+sys.path.append("..")
+import pytest
+import triton
+import torch
+import triton.language as tl
+import test_common
+
+
+# source: python/sglang/srt/layers/attention/triton_backend.py
+@triton.jit
+def get_num_kv_splits_triton(
+    num_kv_splits_ptr,
+    seq_lens_ptr,
+    num_seq,
+    num_group,
+    num_head,
+    num_kv_head,
+    max_kv_splits,
+    device_core_count,
+    MAX_NUM_SEQ: tl.constexpr,
+):
+    # TODO: this method is tunable, we need more online serving data to tune it
+    offs_seq = tl.arange(0, MAX_NUM_SEQ)
+    mask_seq = offs_seq < num_seq
+
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=0)
+    max_seq_len = tl.max(seq_lens)
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=max_seq_len)
+    min_seq_len = tl.min(seq_lens)
+    if max_seq_len * 8 < min_seq_len * 10:
+        min_seq_len = max_seq_len
+    max_kv_splits_1 = tl.minimum(tl.cdiv(max_seq_len, min_seq_len), max_kv_splits)
+    kv_chunk_size_1 = tl.cdiv(max_seq_len, max_kv_splits_1)
+
+    # NOTE: this is a hack to let num_kv_split grows up with seqlen gradually
+    ext_seq_len = tl.cast(max_seq_len, tl.float32) / 64.0
+    ext_device_core_count = tl.cast(
+        device_core_count * tl.maximum(tl.log2(ext_seq_len), 1.0), tl.int32
+    )
+    block_h, num_kv_group = 16, num_head // num_kv_head
+    if num_kv_group == 1:
+        token_grid = num_seq * num_group * num_head
+    else:
+        # from triton_ops/decode_attention.py:_decode_grouped_att_m_fwd
+        block_h = tl.minimum(block_h, num_kv_group)
+        token_grid = num_seq * num_group * tl.cdiv(num_head, block_h)
+    max_kv_splits_2 = tl.minimum(
+        tl.cdiv(ext_device_core_count, token_grid), max_kv_splits
+    )
+    kv_chunk_size_2 = tl.cdiv(max_seq_len, max_kv_splits_2)
+
+    num_kv_splits = tl.maximum(
+        tl.cdiv(seq_lens, kv_chunk_size_1), tl.cdiv(seq_lens, kv_chunk_size_2)
+    )
+
+    offs_token = offs_seq * num_group
+    mask_token = offs_token < num_seq * num_group
+    for i in range(0, num_group):
+        tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)
+
+
+def test_get_num_kv_splits_triton(ptfile_path):
+    try:
+        data = torch.load(ptfile_path, map_location=torch.device('cpu'), weights_only=False)
+    except Exception as e:
+        pytest.fail(f"load file {ptfile_path} failed: {str(e)}")
+
+    # ptfile format:
+    # [input_data] (dict):
+    #     key : value
+    # [gpu_output] (dict):
+    #     key : value
+    # [grid] :
+    #     (1,)
+    input_data = test_common.convert_tensor_with_device_type(data["input_data"], device_type='npu')
+
+    get_num_kv_splits_triton[data["grid"]](**input_data)
+
+    # compare the results of GPU and NPU.
+    try:
+        test_common.compare_data_precision(data["gpu_output"], input_data, device_type='cpu')
+    except ValueError as e:
+        pytest.fail(f"The testcase failed: {str(e)}")
\ No newline at end of file
--
Gitee
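
Note: the test consumes a reference .pt file recorded on a CUDA device in the layout described by the "ptfile format" comment. The patch does not include the dump step; below is a minimal sketch of how such a reference could be produced. The file name, tensor shapes, core count, and the assumption that the kernel definition is importable (or copied) on the GPU machine are all illustrative, not part of this patch.

# gen_get_num_kv_splits_ref.py -- hypothetical helper, not included in this patch.
# Runs the same Triton kernel on a CUDA GPU and saves inputs/outputs in the
# layout the test expects: {"input_data": ..., "gpu_output": ..., "grid": ...}.
import torch
from test_get_num_kv_splits_triton import get_num_kv_splits_triton  # assumed importable


def dump_reference(path="get_num_kv_splits.pt"):
    num_seq, num_group, num_head, num_kv_head = 8, 1, 32, 8      # assumed shapes
    max_kv_splits, device_core_count, MAX_NUM_SEQ = 16, 108, 16  # assumed values
    seq_lens = torch.randint(64, 4096, (num_seq,), dtype=torch.int32, device="cuda")
    num_kv_splits = torch.zeros(num_seq * num_group, dtype=torch.int32, device="cuda")

    input_data = dict(
        num_kv_splits_ptr=num_kv_splits,
        seq_lens_ptr=seq_lens,
        num_seq=num_seq,
        num_group=num_group,
        num_head=num_head,
        num_kv_head=num_kv_head,
        max_kv_splits=max_kv_splits,
        device_core_count=device_core_count,
        MAX_NUM_SEQ=MAX_NUM_SEQ,
    )
    grid = (1,)
    get_num_kv_splits_triton[grid](**input_data)  # kernel writes num_kv_splits in place

    # Keep the output under its input key so the test can diff gpu_output against
    # the NPU-side input_data after the NPU run overwrites the same buffer.
    gpu_output = {"num_kv_splits_ptr": num_kv_splits.clone().cpu()}
    torch.save({"input_data": input_data, "gpu_output": gpu_output, "grid": grid}, path)


if __name__ == "__main__":
    dump_reference()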
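
For hand-checking the stored reference values, the split heuristic can also be mirrored on the host. This is a sketch only; the cdiv helper, function name, and example numbers are assumptions, and the kernel itself additionally replicates each per-sequence value num_group times when storing.

# Host-side mirror of the heuristic inside get_num_kv_splits_triton (sketch only).
import math


def num_kv_splits_ref(seq_lens, num_head, num_kv_head, max_kv_splits,
                      device_core_count, num_group=1):
    cdiv = lambda a, b: -(-a // b)      # ceiling division, like tl.cdiv
    num_seq = len(seq_lens)
    max_len, min_len = max(seq_lens), min(seq_lens)
    if max_len * 8 < min_len * 10:      # lengths within ~20% of each other: treat as uniform
        min_len = max_len
    kv_chunk_1 = cdiv(max_len, min(cdiv(max_len, min_len), max_kv_splits))
    ext_cores = int(device_core_count * max(math.log2(max_len / 64.0), 1.0))
    num_kv_group = num_head // num_kv_head
    if num_kv_group == 1:
        token_grid = num_seq * num_group * num_head
    else:
        token_grid = num_seq * num_group * cdiv(num_head, min(16, num_kv_group))
    kv_chunk_2 = cdiv(max_len, min(cdiv(ext_cores, token_grid), max_kv_splits))
    return [max(cdiv(s, kv_chunk_1), cdiv(s, kv_chunk_2)) for s in seq_lens]


# Example with assumed numbers: 32 query heads, 8 KV heads, 16 max splits, 108 cores.
# num_kv_splits_ref([512, 1024, 4096], 32, 8, 16, 108) -> [2, 4, 16]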