From 090fa757734d09b6fa92986250a35fdc7e2636d7 Mon Sep 17 00:00:00 2001
From: zhuxg33 <1074959344@qq.com>
Date: Fri, 26 Sep 2025 13:00:32 +0800
Subject: [PATCH] add UT case for sglang kernel (get_num_kv_splits_triton)

---
 .../v0.4.8/test_get_num_kv_splits_triton.py | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 ascend/test/sglang/v0.4.8/test_get_num_kv_splits_triton.py

diff --git a/ascend/test/sglang/v0.4.8/test_get_num_kv_splits_triton.py b/ascend/test/sglang/v0.4.8/test_get_num_kv_splits_triton.py
new file mode 100644
index 0000000..366b16d
--- /dev/null
+++ b/ascend/test/sglang/v0.4.8/test_get_num_kv_splits_triton.py
@@ -0,0 +1,84 @@
+import sys
+sys.path.append("..")
+import pytest
+import triton
+import torch
+import triton.language as tl
+import test_common
+
+
+# source: python/sglang/srt/layers/attention/triton_backend.py
+@triton.jit
+def get_num_kv_splits_triton(
+    num_kv_splits_ptr,
+    seq_lens_ptr,
+    num_seq,
+    num_group,
+    num_head,
+    num_kv_head,
+    max_kv_splits,
+    device_core_count,
+    MAX_NUM_SEQ: tl.constexpr,
+):
+    # TODO: this method is tunable, we need more online serving data to tune it
+    offs_seq = tl.arange(0, MAX_NUM_SEQ)
+    mask_seq = offs_seq < num_seq
+
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=0)
+    max_seq_len = tl.max(seq_lens)
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=max_seq_len)
+    min_seq_len = tl.min(seq_lens)
+    if max_seq_len * 8 < min_seq_len * 10:
+        min_seq_len = max_seq_len
+    max_kv_splits_1 = tl.minimum(tl.cdiv(max_seq_len, min_seq_len), max_kv_splits)
+    kv_chunk_size_1 = tl.cdiv(max_seq_len, max_kv_splits_1)
+
+    # NOTE: this is a hack to let num_kv_split grows up with seqlen gradually
+    ext_seq_len = tl.cast(max_seq_len, tl.float32) / 64.0
+    ext_device_core_count = tl.cast(
+        device_core_count * tl.maximum(tl.log2(ext_seq_len), 1.0), tl.int32
+    )
+    block_h, num_kv_group = 16, num_head // num_kv_head
+    if num_kv_group == 1:
+        token_grid = num_seq * num_group * num_head
+    else:
+        # from triton_ops/decode_attention.py:_decode_grouped_att_m_fwd
+        block_h = tl.minimum(block_h, num_kv_group)
+        token_grid = num_seq * num_group * tl.cdiv(num_head, block_h)
+    max_kv_splits_2 = tl.minimum(
+        tl.cdiv(ext_device_core_count, token_grid), max_kv_splits
+    )
+    kv_chunk_size_2 = tl.cdiv(max_seq_len, max_kv_splits_2)
+
+    num_kv_splits = tl.maximum(
+        tl.cdiv(seq_lens, kv_chunk_size_1), tl.cdiv(seq_lens, kv_chunk_size_2)
+    )
+
+    offs_token = offs_seq * num_group
+    mask_token = offs_token < num_seq * num_group
+    for i in range(0, num_group):
+        tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)
+
+
+def test_get_num_kv_splits_triton(ptfile_path):
+    try:
+        data = torch.load(ptfile_path, map_location=torch.device('cpu'), weights_only=False)
+    except Exception as e:
+        pytest.fail(f"load file {ptfile_path} failed: {str(e)}")
+
+    # ptfile format:
+    # [input_data] (dict):
+    #     key : value
+    # [gpu_output] (dict):
+    #     key : value
+    # [grid] :
+    #     (1,)
+    input_data = test_common.convert_tensor_with_device_type(data["input_data"], device_type='npu')
+
+    get_num_kv_splits_triton[data["grid"]](**input_data)
+
+    # compare the results of GPU and NPU.
+    try:
+        test_common.compare_data_precision(data["gpu_output"], input_data, device_type='cpu')
+    except ValueError as e:
+        pytest.fail(f"The testcase failed: {str(e)}")
\ No newline at end of file
--
Gitee
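
Note: the test consumes a reference .pt file recorded on a CUDA device in the layout described by the "ptfile format" comment. The patch does not include the dump step; below is a minimal sketch of how such a reference could be produced. The file name, tensor shapes, core count, and the assumption that the kernel definition is importable (or copied) on the GPU machine are all illustrative, not part of this patch.

# gen_get_num_kv_splits_ref.py -- hypothetical helper, not included in this patch.
# Runs the same Triton kernel on a CUDA GPU and saves inputs/outputs in the
# layout the test expects: {"input_data": ..., "gpu_output": ..., "grid": ...}.
import torch
from test_get_num_kv_splits_triton import get_num_kv_splits_triton  # assumed importable


def dump_reference(path="get_num_kv_splits.pt"):
    num_seq, num_group, num_head, num_kv_head = 8, 1, 32, 8      # assumed shapes
    max_kv_splits, device_core_count, MAX_NUM_SEQ = 16, 108, 16  # assumed values
    seq_lens = torch.randint(64, 4096, (num_seq,), dtype=torch.int32, device="cuda")
    num_kv_splits = torch.zeros(num_seq * num_group, dtype=torch.int32, device="cuda")

    input_data = dict(
        num_kv_splits_ptr=num_kv_splits,
        seq_lens_ptr=seq_lens,
        num_seq=num_seq,
        num_group=num_group,
        num_head=num_head,
        num_kv_head=num_kv_head,
        max_kv_splits=max_kv_splits,
        device_core_count=device_core_count,
        MAX_NUM_SEQ=MAX_NUM_SEQ,
    )
    grid = (1,)
    get_num_kv_splits_triton[grid](**input_data)  # kernel writes num_kv_splits in place

    # Keep the output under its input key so the test can diff gpu_output against
    # the NPU-side input_data after the NPU run overwrites the same buffer.
    gpu_output = {"num_kv_splits_ptr": num_kv_splits.clone().cpu()}
    torch.save({"input_data": input_data, "gpu_output": gpu_output, "grid": grid}, path)


if __name__ == "__main__":
    dump_reference()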
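
For hand-checking the stored reference values, the split heuristic can also be mirrored on the host. This is a sketch only; the cdiv helper, function name, and example numbers are assumptions, and the kernel itself additionally replicates each per-sequence value num_group times when storing.

# Host-side mirror of the heuristic inside get_num_kv_splits_triton (sketch only).
import math


def num_kv_splits_ref(seq_lens, num_head, num_kv_head, max_kv_splits,
                      device_core_count, num_group=1):
    cdiv = lambda a, b: -(-a // b)      # ceiling division, like tl.cdiv
    num_seq = len(seq_lens)
    max_len, min_len = max(seq_lens), min(seq_lens)
    if max_len * 8 < min_len * 10:      # lengths within ~20% of each other: treat as uniform
        min_len = max_len
    kv_chunk_1 = cdiv(max_len, min(cdiv(max_len, min_len), max_kv_splits))
    ext_cores = int(device_core_count * max(math.log2(max_len / 64.0), 1.0))
    num_kv_group = num_head // num_kv_head
    if num_kv_group == 1:
        token_grid = num_seq * num_group * num_head
    else:
        token_grid = num_seq * num_group * cdiv(num_head, min(16, num_kv_group))
    kv_chunk_2 = cdiv(max_len, min(cdiv(ext_cores, token_grid), max_kv_splits))
    return [max(cdiv(s, kv_chunk_1), cdiv(s, kv_chunk_2)) for s in seq_lens]


# Example with assumed numbers: 32 query heads, 8 KV heads, 16 max splits, 108 cores.
# num_kv_splits_ref([512, 1024, 4096], 32, 8, 16, 108) -> [2, 4, 16]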