From 99e13f68d80762255345f9ed35c85a1e0c19ff8a Mon Sep 17 00:00:00 2001
From: mhh001
Date: Thu, 25 Sep 2025 10:49:50 +0800
Subject: [PATCH 1/3] [pytorch][feature] add qwen3 235b sh

---
 .../pretrain_qwen3_235b_a22b_4k_A2_ptd.sh     | 168 ++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh

diff --git a/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh b/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh
new file mode 100644
index 0000000000..6337cb1e9c
--- /dev/null
+++ b/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export HCCL_IF_BASE_PORT=25809
+export CPU_AFFINITY_CONF=1
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export HCCL_CONNECT_TIMEOUT=3600
+export TASK_QUEUE_ENABLE=2
+export HCCL_ALGO="alltoall=level0:NA;level1:pipeline"
+export HCCL_BUFFSIZE=400
+
+
+NPUS_PER_NODE=16
+MASTER_ADDR=localhost
+MASTER_PORT=60011
+NNODES=16
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=2
+PP=8
+VPP=6
+EP=8
+CP=1
+MBS=1
+GBS=1024
+CP_TYPE='megatron_cp_algo'
+SEQ_LENGTH=4096
+TRAIN_ITERS=2000
+ROUTER_BALANCING_TYPE='aux_loss'
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+MOE_ARGS="
+    --num-experts 128 \
+    --moe-router-topk 8 \
+    --moe-ffn-hidden-size 1536 \
+    --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \
+    --norm-topk-prob \
+    --moe-grouped-gemm \
+    --moe-token-dispatcher-type alltoall_seq \
+    --moe-aux-loss-coeff 0.001 \
+    --moe-permutation-async-comm \
+    --moe-alltoall-overlap-comm \
+    --moe-permute-fusion \
+    --moe-tp-extend-ep \
+"
+
+OPTIMIZE_ARGS="
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --use-fused-rmsnorm \
+    --no-masked-softmax-fusion \
+    --use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --manual-gc \
+    --manual-gc-interval 10 \
+"
+
+RECOMPUTE_ARGS="
+    --recompute-granularity full \
+    --recompute-method block \
+    --recompute-num-layers 8 \
+"
+
+TRAIN_ARGS="
+    --fix-router \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --lr 1.25e-6 \
+    --lr-decay-style cosine \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --initial-loss-scale 4096 \
+    --seed 42 \
+    --bf16 \
+    --train-iters ${TRAIN_ITERS} \
+    --seq-length ${SEQ_LENGTH} \
+    --no-shared-storage
+"
+
+MODEL_PARALLEL_ARGS="
+    --sequence-parallel \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --expert-model-parallel-size ${EP} \
+    --num-layers-per-virtual-pipeline-stage ${VPP} \
+"
+
+GPT_ARGS="
+    --kv-channels 128 \
+    --spec mindspeed_llm.tasks.models.spec.qwen3_spec layer_spec \
+    --qk-layernorm \
+    --gemm-gradient-accumulation-fusion \
+    --reuse-fp32-param \
+    --use-mcore-models \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --max-position-embeddings ${SEQ_LENGTH} \
+    --noop-layers 94,95 \
+    --num-layers 96 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 12288 \
+    --num-attention-heads 64 \
+    --tokenizer-type PretrainedFromHF \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --rotary-base 1000000 \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --swiglu \
+    --attention-softmax-in-fp32 \
+    --group-query-attention \
+    --num-query-groups 4 \
+    --no-shared-storage
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval ${TRAIN_ITERS} \
+    --eval-interval ${TRAIN_ITERS} \
+    --eval-iters 0 \
+    --no-load-optim \
+    --no-load-rng
+"
+
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $MOE_ARGS \
+    $OUTPUT_ARGS \
+    $OPTIMIZE_ARGS \
+    $TRAIN_ARGS \
+    $MODEL_PARALLEL_ARGS \
+    $RECOMPUTE_ARGS \
+    --save ${CKPT_SAVE_DIR} \
+    --load ${CKPT_LOAD_DIR} \
+    --distributed-backend nccl \
+    | tee logs/train_mcore_qwen3_235b_4k_A2.log
--
Gitee

From a3a793f6c8dff1b674de0e6dd7349fb695672c93 Mon Sep 17 00:00:00 2001
From: mhh001
Date: Thu, 25 Sep 2025 15:46:58 +0800
Subject: [PATCH 2/3] [pytorch][feature] add qwen3 235b sh

---
 tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh b/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh
index 6337cb1e9c..91133b780e 100644
--- a/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh
+++ b/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh
@@ -23,13 +23,13 @@ DATA_PATH="your data path"
 TOKENIZER_PATH="your tokenizer path"
 CKPT_LOAD_DIR="your model ckpt path"
 
-TP=2
+TP=1
 PP=8
 VPP=6
 EP=8
 CP=1
 MBS=1
-GBS=1024
+GBS=2048
 CP_TYPE='megatron_cp_algo'
 SEQ_LENGTH=4096
 TRAIN_ITERS=2000
--
Gitee

From 22d74c4960ef3a8e229a86227913799322499c2e Mon Sep 17 00:00:00 2001
From: mhh001
Date: Thu, 25 Sep 2025 20:16:36 +0800
Subject: [PATCH 3/3] [pytorch][feature] add qwen3 235b sh

---
 tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh b/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh
index 91133b780e..7abcbc4eb6 100644
--- a/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh
+++ b/tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh
@@ -29,7 +29,7 @@ VPP=6
 EP=8
 CP=1
 MBS=1
-GBS=2048
+GBS=1024
 CP_TYPE='megatron_cp_algo'
 SEQ_LENGTH=4096
 TRAIN_ITERS=2000
--
Gitee
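
Usage note (editor's sketch, not part of the patch series): the added script hard-codes MASTER_ADDR=localhost and NODE_RANK=0, so a 16-node run needs both values adjusted on every node before launching, and the "your ... path" placeholders for data, tokenizer, and checkpoints must be filled in first. Below is a minimal per-node launch sketch, assuming the MindSpeed-LLM repository root as the working directory; MASTER_IP and THIS_NODE_RANK are hypothetical values supplied by the operator, not defined anywhere in the patches.

    #!/bin/bash
    # Hypothetical per-node launch helper: patch MASTER_ADDR/NODE_RANK in the
    # training script, make sure the log directory exists, then run it.
    MASTER_IP=${MASTER_IP:?set to the IP of the rank-0 node}
    THIS_NODE_RANK=${THIS_NODE_RANK:?set to this node's rank, 0..15}
    SCRIPT=tests/poc/qwen3-moe/pretrain_qwen3_235b_a22b_4k_A2_ptd.sh

    mkdir -p logs   # the trailing `tee` in the training script writes into logs/
    sed -i "s/^MASTER_ADDR=.*/MASTER_ADDR=${MASTER_IP}/" "$SCRIPT"
    sed -i "s/^NODE_RANK=.*/NODE_RANK=${THIS_NODE_RANK}/" "$SCRIPT"
    bash "$SCRIPT"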