# ModelLink

**Repository Path**: guoyiwei111/ModelLink

## Basic Information

- **Project Name**: ModelLink
- **Description**: Ascend large-model (LLM) repository
- **Primary Language**: Python
- **License**: MIT
- **Default Branch**: master
- **Homepage**: https://gitee.com/ascend/ModelLink/
- **GVP Project**: No

## Statistics

- **Stars**: 0
- **Forks**: 1197
- **Created**: 2024-02-23
- **Last Updated**: 2024-08-16

## Categories & Tags

**Categories**: Uncategorized
**Tags**: None

## README

The README of this fork carries the source of a custom AscendC `MaskedSelect` kernel. Each AI core compacts the elements of `x` whose `mask` byte equals 1 into its own slice of a global workspace, the cores then exchange their per-core output counts, and each core copies its compacted slice to the correct offset of the output `y`.

```cpp
#include "kernel_operator.h"

using namespace AscendC;

// Element-size categories; the 8-byte case covers int64 / uint64 / double.
#define IS_2_BYTES_TYPE \
    (is_same<T, half>::value || is_same<T, int16_t>::value || \
     is_same<T, uint16_t>::value || is_same<T, bfloat16_t>::value)
#define IS_4_BYTES_TYPE \
    (is_same<T, float>::value || is_same<T, int32_t>::value || is_same<T, uint32_t>::value)
#define IS_8_BYTES_TYPE \
    (is_same<T, int64_t>::value || is_same<T, uint64_t>::value || is_same<T, double>::value)

constexpr int32_t BUFFER_NUM = 2; // TODO: is this supposed to be changed to 2 later?

// Minimal is_same implementation (no <type_traits> on the device side).
template <typename Tp, Tp v>
struct integral_constant {
    static constexpr Tp value = v;
};
using true_type = integral_constant<bool, true>;
using false_type = integral_constant<bool, false>;
template <typename, typename>
struct is_same : public false_type {};
template <typename Tp>
struct is_same<Tp, Tp> : public true_type {};

namespace AscendC {

template <typename T>
class KernelMaskedSelectCustom {
public:
    __aicore__ inline KernelMaskedSelectCustom() {}

    __aicore__ inline void Init(GM_ADDR x, GM_ADDR mask, GM_ADDR y, GM_ADDR workspace,
                                uint32_t blockLength, uint32_t tileNum, uint32_t tileLength,
                                uint32_t lasttileLength, uint32_t formerNum, uint32_t formerLength,
                                uint32_t formertileNum, uint32_t formertileLength,
                                uint32_t formerlasttileLength, uint32_t tailNum, uint32_t tailLength,
                                uint32_t tailtileNum, uint32_t tailtileLength,
                                uint32_t taillasttileLength, uint32_t tilingKey, uint32_t blockDim)
    {
        ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
        // The first blockDim * 8 uint64_t slots of the workspace hold the
        // per-core output counts; the compacted elements follow.
        __gm__ T* globalWorkTensor = (__gm__ T*)((__gm__ uint64_t*)workspace + blockDim * 8);
        this->blockDim = blockDim;
        if (tilingKey == 1) {
            this->blockLength = blockLength;
            this->tileNum = tileNum * BUFFER_NUM;
            this->tileLength = tileLength / BUFFER_NUM;
            this->lasttileLength = lasttileLength / BUFFER_NUM;
            xGlobal.SetGlobalBuffer((__gm__ T*)x + this->blockLength * GetBlockIdx(), this->blockLength);
            maskGlobal.SetGlobalBuffer((__gm__ uint8_t*)mask + this->blockLength * GetBlockIdx(), this->blockLength);
            workGlobal.SetGlobalBuffer(globalWorkTensor + this->blockLength * GetBlockIdx(), this->blockLength);
        } else { // tilingKey == 2
            this->formerNum = formerNum;
            this->formerLength = formerLength;
            this->formertileNum = formertileNum;
            this->formertileLength = formertileLength;
            this->formerlasttileLength = formerlasttileLength;
            this->tailNum = tailNum;
            this->tailLength = tailLength;
            this->tailtileNum = tailtileNum;
            this->tailtileLength = tailtileLength;
            this->taillasttileLength = taillasttileLength;
            if (GetBlockIdx() < this->formerNum) {
                // Cores assigned a larger ("former") chunk.
                this->tileLength = this->formertileLength / BUFFER_NUM;
                this->lasttileLength = this->formerlasttileLength / BUFFER_NUM;
                this->tileNum = this->formertileNum * BUFFER_NUM;
                xGlobal.SetGlobalBuffer((__gm__ T*)x + this->formerLength * GetBlockIdx(), this->formerLength);
                maskGlobal.SetGlobalBuffer((__gm__ uint8_t*)mask + this->formerLength * GetBlockIdx(), this->formerLength);
                workGlobal.SetGlobalBuffer(globalWorkTensor + this->formerLength * GetBlockIdx(), this->formerLength + 1);
            } else {
                // Cores assigned a smaller ("tail") chunk; they process
                // alignNum fewer elements than the former cores.
                this->tileLength = this->tailtileLength / BUFFER_NUM;
                this->lasttileLength = this->taillasttileLength / BUFFER_NUM;
                this->tileNum = this->tailtileNum * BUFFER_NUM;
                xGlobal.SetGlobalBuffer(
                    (__gm__ T*)x + this->formerLength * this->formerNum +
                        this->tailLength * (GetBlockIdx() - this->formerNum),
                    this->tailLength);
                maskGlobal.SetGlobalBuffer(
                    (__gm__ uint8_t*)mask + this->formerLength * this->formerNum +
                        this->tailLength * (GetBlockIdx() - this->formerNum),
                    this->tailLength);
                workGlobal.SetGlobalBuffer(
                    globalWorkTensor + this->formerLength * this->formerNum +
                        this->tailLength * (GetBlockIdx() - this->formerNum),
                    this->tailLength);
            }
        }
        offsetGlobal.SetGlobalBuffer((__gm__ uint64_t*)workspace, GetBlockIdx());
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(T));
        pipe.InitBuffer(inQueueMask, BUFFER_NUM, this->tileLength * sizeof(uint8_t));
        pipe.InitBuffer(outQueueY, BUFFER_NUM, this->tileLength * sizeof(T));
        pipe.InitBuffer(moveQue, BUFFER_NUM, this->tileLength * sizeof(T));
        if constexpr (IS_8_BYTES_TYPE) {
            // 8-byte elements are processed as pairs of 32-bit lanes, so the
            // cast buffer and bit mask are sized for tileLength * 2 lanes.
            pipe.InitBuffer(maskCastBuf, this->tileLength * sizeof(float));
            pipe.InitBuffer(bitMaskBuf, this->tileLength * 2 / 8);
        } else {
            pipe.InitBuffer(maskCastBuf, this->tileLength * sizeof(half));
            pipe.InitBuffer(bitMaskBuf, this->tileLength / 8);
        }
    }
```
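`Init` consumes a long list of tiling parameters, but the host-side tiling function is not part of this file. The sketch below shows one way such a former/tail split could be derived, assuming `totalLength` is already padded to a multiple of `alignNum`; the struct and function names are hypothetical, and this is not the repository's actual tiling code.

```cpp
#include <cstdint>

// Hypothetical host-side tiling sketch, NOT the repository's tiling function.
// tilingKey 1: the data splits evenly and alignedly across cores.
// tilingKey 2: the first formerNum cores take a larger aligned chunk
// (formerLength); the remaining tailNum cores take alignNum fewer elements.
struct MaskedSelectTilingSketch {
    uint32_t tilingKey = 0, blockLength = 0;
    uint32_t formerNum = 0, formerLength = 0, tailNum = 0, tailLength = 0;
};

MaskedSelectTilingSketch ComputeTilingSketch(uint32_t totalLength, uint32_t blockDim,
                                             uint32_t alignNum /* e.g. 32 / sizeof(T) */)
{
    MaskedSelectTilingSketch t;
    if (totalLength % blockDim == 0 && (totalLength / blockDim) % alignNum == 0) {
        t.tilingKey = 1;
        t.blockLength = totalLength / blockDim; // every core gets the same slice
    } else {
        t.tilingKey = 2;
        uint32_t share = (totalLength + blockDim - 1) / blockDim;      // ceil-divide
        t.formerLength = (share + alignNum - 1) / alignNum * alignNum; // align up
        t.tailLength = t.formerLength - alignNum;                      // alignNum fewer
        t.formerNum = (totalLength - t.tailLength * blockDim) / alignNum;
        t.tailNum = blockDim - t.formerNum;
        // Invariant: formerNum * formerLength + tailNum * tailLength == totalLength.
    }
    return t;
}
```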
Continuing inside the class, `Process` drives three phases: per-tile compaction into the workspace, a cross-core exchange of output counts, and the final move from the workspace into `y`. `CopyInMove` and `CopyOutMove` implement that final move.

```cpp
    __aicore__ inline void Process(GM_ADDR y)
    {
        // Phase 1: compact every tile of this core's slice into the workspace.
        int32_t loopCount = this->tileNum;
        for (int32_t i = 0; i < loopCount; ++i) {
            CopyIn(i);
            Compute(i);
            CopyOut2WorkSpace();
        }
        // Phase 2: publish this core's output count to the workspace, then sum
        // the counts of all lower-indexed cores to locate this core's slice of y.
        offsetGlobal.SetValue(GetBlockIdx() << 3, this->outOffset);
        DataCacheCleanAndInvalid<uint64_t, CacheLine::SINGLE_CACHE_LINE>(offsetGlobal[GetBlockIdx() << 3]);
        SyncAll();
        uint64_t ind = 0;
        for (int32_t i = 0; i < GetBlockIdx(); i++) {
            DataCacheCleanAndInvalid<uint64_t, CacheLine::SINGLE_CACHE_LINE>(offsetGlobal[i << 3]);
            ind += offsetGlobal.GetValue(i << 3);
        }
        yGlobal.SetGlobalBuffer((__gm__ T*)y + ind, this->outOffset);
        // Phase 3: move the compacted data from the workspace into y,
        // full tiles first, then the remainder that does not fill a tile.
        loopCount = this->outOffset / this->tileLength;
        int32_t tailLoopLength = this->outOffset % this->tileLength;
        for (int32_t i = 0; i < loopCount; ++i) {
            CopyInMove(i, this->tileLength);
            CopyOutMove(i, this->tileLength);
        }
        if (tailLoopLength > 0) {
            CopyInMove(loopCount, tailLoopLength);
            CopyOutMove(loopCount, tailLoopLength);
        }
    }

private:
    __aicore__ inline void CopyInMove(int32_t progress, int32_t length)
    {
        LocalTensor<T> xLocal = moveQue.AllocTensor<T>();
        if constexpr (IS_8_BYTES_TYPE) { // int64 / uint64 / double
            DataCopyPadDoubleWord(xLocal, workGlobal[progress * this->tileLength], length);
        } else {
            DataCopyExtParams copyParams{1, static_cast<uint32_t>(length * sizeof(T)), 0, 0, 0};
            DataCopyPadExtParams<T> padParams{false, 0, 0, 0};
            DataCopyPad(xLocal, workGlobal[progress * this->tileLength], copyParams, padParams);
        }
        moveQue.EnQue(xLocal);
    }

    __aicore__ inline void CopyOutMove(int32_t progress, int32_t length)
    {
        LocalTensor<T> yLocal = moveQue.DeQue<T>();
        if constexpr (IS_8_BYTES_TYPE) {
            DataCopyPadDoubleWord(yGlobal[progress * this->tileLength], yLocal, length);
        } else {
            DataCopyExtParams copyParams{1, static_cast<uint32_t>(length * sizeof(T)), 0, 0, 0};
            DataCopyPad(yGlobal[progress * this->tileLength], yLocal, copyParams);
        }
        moveQue.FreeTensor(yLocal);
    }
```
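The count exchange in phase 2 is an exclusive prefix sum over the per-core counts; each count sits at a stride of 8 `uint64_t` (one 64-byte cache line, hence `GetBlockIdx() << 3`) so cores never share a line. A host-side model of the same arithmetic, with a hypothetical helper name:

```cpp
#include <cstdint>
#include <vector>

// Host-side model of the phase-2 offset exchange (illustration only).
// counts[i] is core i's outOffset; the result is where each core's slice
// begins inside the final output y.
std::vector<uint64_t> ExclusivePrefixSum(const std::vector<uint64_t>& counts)
{
    std::vector<uint64_t> start(counts.size(), 0);
    for (size_t core = 1; core < counts.size(); ++core) {
        start[core] = start[core - 1] + counts[core - 1];
    }
    return start; // core i writes counts[i] elements at y + start[i]
}
```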
`CopyIn` stages one tile of `x` and `mask` into the unified buffer, `GenerateMask` turns the byte mask into the bit mask consumed by `GatherMask`, and `GatherResult` performs the actual compaction.

```cpp
    __aicore__ inline void CopyIn(int32_t progress)
    {
        LocalTensor<T> xLocal = inQueueX.AllocTensor<T>();
        LocalTensor<uint8_t> maskLocal = inQueueMask.AllocTensor<uint8_t>();
        uint32_t ind = progress * this->tileLength;
        uint32_t length = this->tileLength;
        if (BUFFER_NUM == 1 && progress != 0) {
            if (progress == this->tileNum - 1) {
                // The last tile may be shorter than tileLength. (For a single
                // tile the start address is 0 and tileLength is already the
                // real chunk size, hence the progress != 0 guard.)
                length = this->lasttileLength;
            }
        }
        if (BUFFER_NUM == 2 && progress != 0) {
            if (progress == this->tileNum - 1) {
                // With double buffering the last two half-tiles are both
                // lasttileLength long; the final one also starts early.
                ind = (progress - 1) * this->tileLength + this->lasttileLength;
                length = this->lasttileLength;
            } else if (progress == this->tileNum - 2) {
                length = this->lasttileLength;
            }
        }
        if constexpr (IS_8_BYTES_TYPE) { // int64 / uint64 / double
            DataCopyPadDoubleWord(xLocal, xGlobal[ind], length);
        } else {
            DataCopyExtParams copyParams{1, static_cast<uint32_t>(length * sizeof(T)), 0, 0, 0};
            DataCopyPadExtParams<T> padParams{false, 0, 0, 0};
            DataCopyPad(xLocal, xGlobal[ind], copyParams, padParams);
        }
        {
            DataCopyExtParams copyParams{1, static_cast<uint32_t>(length), 0, 0, 0};
            DataCopyPadExtParams<uint8_t> padParams{false, 0, 0, 0};
            DataCopyPad(maskLocal, maskGlobal[ind], copyParams, padParams);
        }
        inQueueX.EnQue(xLocal);
        inQueueMask.EnQue(maskLocal);
    }

    __aicore__ inline void GenerateMask(const LocalTensor<uint8_t>& mask,
                                        LocalTensor<uint8_t>& bitMask, uint32_t count)
    {
        LocalTensor<half> maskCastLocal = maskCastBuf.Get<half>();
        Duplicate(maskCastLocal, static_cast<half>(0), static_cast<int32_t>(this->tileLength));
        Cast(maskCastLocal, mask, RoundMode::CAST_NONE, count);
        PipeBarrier<PIPE_V>();
        if constexpr (IS_8_BYTES_TYPE) {
            // Each 8-byte element is gathered as two 32-bit lanes, so each
            // mask byte must appear twice: build (b << 8) | b in int16, then
            // reinterpret the result as 2 * tileLength int8 values.
            LocalTensor<int16_t> maskCastInt16 = maskCastLocal.template ReinterpretCast<int16_t>();
            LocalTensor<int16_t> maskCastInt16Shift =
                maskCastLocal[this->tileLength].template ReinterpretCast<int16_t>();
            Cast(maskCastInt16, maskCastLocal, RoundMode::CAST_ROUND, this->tileLength);
            ShiftLeft(maskCastInt16Shift, maskCastInt16, static_cast<int16_t>(8), this->tileLength);
            Add(maskCastInt16Shift, maskCastInt16, maskCastInt16Shift, this->tileLength);
            Cast(maskCastLocal, maskCastInt16Shift.ReinterpretCast<int8_t>(),
                 RoundMode::CAST_NONE, this->tileLength * 2);
            CompareScalar(bitMask, maskCastLocal, static_cast<half>(1.0), CMPMODE::EQ,
                          this->tileLength * 2);
        } else {
            CompareScalar(bitMask, maskCastLocal, static_cast<half>(1.0), CMPMODE::EQ,
                          this->tileLength);
        }
    }

    __aicore__ inline void GatherResult(LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal,
                                        const LocalTensor<uint8_t>& bitMaskLocal, int32_t count)
    {
        GatherMaskParams params;
        params.src0BlockStride = 1;
        params.repeatTimes = 1;
        params.src0RepeatStride = 8;
        params.src1RepeatStride = 1;
        if constexpr (IS_8_BYTES_TYPE) {
            // Gather the 32-bit halves of each 8-byte element.
            uint32_t mask = count * 2;
            LocalTensor<uint32_t> bitMask = bitMaskLocal.ReinterpretCast<uint32_t>();
            LocalTensor<uint32_t> dstCastLocal = dstLocal.template ReinterpretCast<uint32_t>();
            LocalTensor<uint32_t> srcCastLocal = srcLocal.template ReinterpretCast<uint32_t>();
            GatherMask(dstCastLocal, srcCastLocal, bitMask, true, mask, params, rsvdCnt);
        } else if constexpr (IS_2_BYTES_TYPE) {
            uint32_t mask = count;
            LocalTensor<uint16_t> bitMask = bitMaskLocal.ReinterpretCast<uint16_t>();
            GatherMask(dstLocal, srcLocal, bitMask, true, mask, params, rsvdCnt);
        } else {
            uint32_t mask = count;
            LocalTensor<uint32_t> bitMask = bitMaskLocal.ReinterpretCast<uint32_t>();
            // rsvdCnt: final number of valid (kept) elements.
            GatherMask(dstLocal, srcLocal, bitMask, true, mask, params, rsvdCnt);
        }
    }
```
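For 8-byte element types, `GenerateMask` widens each mask byte `b` into the 16-bit value `(b << 8) | b`, so that after reinterpreting the buffer as bytes every mask value appears twice, once per 32-bit lane of the 64-bit element. A scalar C++ model of that expansion (illustration only, with a hypothetical function name):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of the 8-byte mask expansion in GenerateMask: one mask byte
// per 64-bit element becomes two identical bytes, one per 32-bit lane, so a
// lane-wise GatherMask keeps or drops both halves of an element together.
std::vector<uint8_t> ExpandMaskForDoubleWord(const std::vector<uint8_t>& mask)
{
    std::vector<uint8_t> lanes;
    lanes.reserve(mask.size() * 2);
    for (uint8_t b : mask) {
        uint16_t widened = static_cast<uint16_t>((b << 8) | b); // the ShiftLeft + Add step
        lanes.push_back(static_cast<uint8_t>(widened & 0xFF));  // low byte  -> lane 0
        lanes.push_back(static_cast<uint8_t>(widened >> 8));    // high byte -> lane 1
    }
    return lanes;
}

int main()
{
    for (uint8_t lane : ExpandMaskForDoubleWord({1, 0, 1})) {
        printf("%u ", static_cast<unsigned>(lane)); // prints: 1 1 0 0 1 1
    }
    return 0;
}
```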
`Compute` picks the per-tile element count (the last tile, or the last two half-tiles under double buffering, are shorter) and runs mask generation plus gathering. The two `DataCopyPadDoubleWord` overloads move 8-byte types as pairs of 32-bit words, and `CopyOut2WorkSpace` appends each gathered tile to this core's workspace slice.

```cpp
    __aicore__ inline void Compute(int32_t progress)
    {
        LocalTensor<T> xLocal = inQueueX.DeQue<T>();
        LocalTensor<uint8_t> maskLocal = inQueueMask.DeQue<uint8_t>();
        LocalTensor<T> yLocal = outQueueY.AllocTensor<T>();
        // bitMaskBuf is a plain TBuf, so Get() returns its tensor directly;
        // unlike DeQue there is no queue synchronization involved.
        LocalTensor<uint8_t> bitMaskLocal = bitMaskBuf.Get<uint8_t>();
        uint32_t count = this->tileLength;
        if (BUFFER_NUM == 1) {
            // Only the last tile can be short, and only when it is not the
            // sole tile (a single tile starts at 0 with the full tileLength).
            if (progress == this->tileNum - 1 && progress != 0) {
                count = this->lasttileLength;
            }
        } else if (BUFFER_NUM == 2) {
            // With double buffering the last two half-tiles are short.
            if ((progress == this->tileNum - 1 || progress == this->tileNum - 2) && progress != 0) {
                count = this->lasttileLength;
            }
        }
        GenerateMask(maskLocal, bitMaskLocal, count);
        GatherResult(yLocal, xLocal, bitMaskLocal, count);
        outQueueY.EnQue(yLocal);
        inQueueX.FreeTensor(xLocal);
        inQueueMask.FreeTensor(maskLocal);
    }

    // Copy 8-byte elements as twice as many 32-bit words; the bit pattern
    // is preserved.
    __aicore__ inline void DataCopyPadDoubleWord(const LocalTensor<T>& dstLocal,
                                                 const GlobalTensor<T>& srcGlobal, int64_t count)
    {
        GlobalTensor<int32_t> srcCastGlobal;
        // View the 64-bit GM data as count * 2 32-bit words.
        srcCastGlobal.SetGlobalBuffer((__gm__ int32_t*)srcGlobal.GetPhyAddr(), count * 2);
        // View the UB tensor as int32 as well.
        LocalTensor<int32_t> dstCastLocal = dstLocal.template ReinterpretCast<int32_t>();
        DataCopyExtParams copyParams{1, static_cast<uint32_t>(count * 2 * sizeof(int32_t)), 0, 0, 0};
        DataCopyPadExtParams<int32_t> padParams{false, 0, 0, 0};
        DataCopyPad(dstCastLocal, srcCastGlobal, copyParams, padParams);
    }

    __aicore__ inline void DataCopyPadDoubleWord(const GlobalTensor<T>& dstGlobal,
                                                 const LocalTensor<T>& srcLocal, int64_t count)
    {
        GlobalTensor<int32_t> dstCastGlobal;
        dstCastGlobal.SetGlobalBuffer((__gm__ int32_t*)dstGlobal.GetPhyAddr(), count * 2);
        LocalTensor<int32_t> srcCastLocal = srcLocal.template ReinterpretCast<int32_t>();
        DataCopyExtParams copyParams{1, static_cast<uint32_t>(count * 2 * sizeof(int32_t)), 0, 0, 0};
        DataCopyPad(dstCastGlobal, srcCastLocal, copyParams);
    }

    __aicore__ inline void CopyOut2WorkSpace()
    {
        LocalTensor<T> yLocal = outQueueY.DeQue<T>();
        if constexpr (IS_8_BYTES_TYPE) {
            // rsvdCnt counts 32-bit lanes here, i.e. two per 8-byte element.
            DataCopyPadDoubleWord(workGlobal[outOffset], yLocal, rsvdCnt / 2);
            outOffset += rsvdCnt / 2;
        } else {
            DataCopyExtParams copyParams{1, static_cast<uint32_t>(rsvdCnt * sizeof(T)), 0, 0, 0};
            DataCopyPad(workGlobal[outOffset], yLocal, copyParams);
            outOffset += rsvdCnt;
        }
        outQueueY.FreeTensor(yLocal);
    }

private:
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueMask;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueY;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueDst; // unused
    TQueBind<QuePosition::VECIN, QuePosition::VECOUT, BUFFER_NUM> moveQue;
    TBuf<QuePosition::VECCALC> maskCastBuf;
    TBuf<QuePosition::VECCALC> bitMaskBuf;

    GlobalTensor<T> xGlobal;
    GlobalTensor<T> yGlobal;
    GlobalTensor<uint8_t> maskGlobal;
    GlobalTensor<T> workGlobal;
    GlobalTensor<uint64_t> offsetGlobal;

    uint32_t blockDim;
    uint32_t blockLength;
    uint32_t tileNum;
    uint32_t tileLength;
    uint32_t lasttileLength;
    uint32_t formerNum;
    uint32_t formerLength;
    uint32_t formertileNum;
    uint32_t formertileLength;
    uint32_t formerlasttileLength;
    uint32_t tailNum;
    uint32_t tailLength;
    uint32_t tailtileNum;
    uint32_t tailtileLength;
    uint32_t taillasttileLength;
    uint64_t rsvdCnt = 0;   // valid element (or 32-bit lane) count from the last GatherMask
    uint64_t outOffset = 0; // elements already compacted into this core's workspace slice
};

} // namespace AscendC
```
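The last-tile bookkeeping that `CopyIn` and `Compute` share is easier to see as a schedule: with `BUFFER_NUM == 2` every half-tile is `tileLength` long except the final two, which are `lasttileLength`, and the very last one also starts early. A host-side model of the (offset, length) pairs, with a hypothetical function name:

```cpp
#include <cstdint>
#include <utility>

// Host-side model of the (offset, length) schedule used by CopyIn/Compute
// when BUFFER_NUM == 2 (illustration only, mirroring the kernel's branches).
std::pair<uint32_t, uint32_t> TileSpan(uint32_t progress, uint32_t tileNum,
                                       uint32_t tileLength, uint32_t lasttileLength)
{
    uint32_t ind = progress * tileLength;
    uint32_t length = tileLength;
    if (progress != 0) {
        if (progress == tileNum - 1) {
            // Final half-tile: starts right after the shortened one before it.
            ind = (progress - 1) * tileLength + lasttileLength;
            length = lasttileLength;
        } else if (progress == tileNum - 2) {
            length = lasttileLength; // second-to-last half-tile is also short
        }
    }
    return {ind, length};
}
```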
The kernel entry reads the tiling data, resolves the tiling key, and hands the user workspace to the class; a host launcher is compiled in for the non-CPU-debug build.

```cpp
extern "C" __global__ __aicore__ void masked_select_custom(GM_ADDR x, GM_ADDR mask, GM_ADDR y,
                                                           GM_ADDR workspace, GM_ADDR tiling)
{
    GET_TILING_DATA(tiling_data, tiling);
    // DTYPE_X is the dtype macro generated for input x by the operator build framework.
    AscendC::KernelMaskedSelectCustom<DTYPE_X> op;
    uint32_t tilingKey = 1;
    if (TILING_KEY_IS(1)) {
        tilingKey = 1;
    } else if (TILING_KEY_IS(2)) {
        tilingKey = 2;
    } else {
        tilingKey = 1;
    }
    GM_ADDR usrWorkspace = GetUserWorkspace(workspace); // fetch the user workspace pointer
    op.Init(x, mask, y, usrWorkspace,
            tiling_data.blockLength, tiling_data.tileNum, tiling_data.tileLength,
            tiling_data.lasttileLength, tiling_data.formerNum, tiling_data.formerLength,
            tiling_data.formertileNum, tiling_data.formertileLength,
            tiling_data.formerlasttileLength, tiling_data.tailNum, tiling_data.tailLength,
            tiling_data.tailtileNum, tiling_data.tailtileLength, tiling_data.taillasttileLength,
            tilingKey, tiling_data.blockDim);
    op.Process(y);
}

#ifndef __CCE_KT_TEST__
// call of kernel function
void masked_select_custom_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* x,
                             uint8_t* mask, uint8_t* y, uint8_t* workspace, uint8_t* tiling)
{
    masked_select_custom<<<blockDim, l2ctrl, stream>>>(x, mask, y, workspace, tiling);
}
#endif
```
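For verification, the kernel's output can be checked against a straightforward CPU reference of masked-select semantics (stable compaction of `x` where `mask` equals 1, matching the `CompareScalar` test in `GenerateMask`). The helper below is a hypothetical test utility, not part of the operator package:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical CPU reference for validating the kernel: keeps x[i] whenever
// mask[i] == 1, preserving order. This is exactly what the per-core
// GatherMask passes plus the prefix-sum concatenation compute overall.
template <typename T>
std::vector<T> MaskedSelectReference(const std::vector<T>& x, const std::vector<uint8_t>& mask)
{
    std::vector<T> y;
    for (size_t i = 0; i < x.size(); ++i) {
        if (mask[i] == 1) {
            y.push_back(x[i]);
        }
    }
    return y;
}
```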