diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index 96b2ea835449aa974223ee293cbca72ac158a0aa..d35048a2e3fe115d82f4fbc30adfcc686297fc51 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -47,6 +47,8 @@ 1. 建议用户结合运行资源状况编写对应训练脚本。若训练脚本与资源状况不匹配,如数据集加载内存大小超出内存容量限制、训练脚本在本地生成数据超过磁盘空间大小等情况,可能引发错误并导致进程意外退出。 2. MindSpeed-LLM内部用到了PyTorch,可能会因为版本不匹配导致运行错误,具体可参考PyTorch[安全声明](https://gitee.com/ascend/pytorch#%E5%AE%89%E5%85%A8%E5%A3%B0%E6%98%8E)。 3. 本软件使用PyTorch的torch.load做模型加载,代码中存在该接口使用场景配置参数weights_only=True,对于PyTorch版本<=2.5.1时,存在反序列化漏洞CVE-2025-32434,请用户保障所加载权重的安全性,避免恶意模型加载使执行机/设备遭到攻击。 +4. HumanEval评估使用了subprocess.run,存在安全风险。为了不影响功能正常使用,增加了相应的安全校验进行规避,请用户根据需要自行完善黑名单(configs/dangerous_shell.json),加强安全防护。 +5. 出于安全考虑,默认设置trust_remote_code=False,因此无法远程加载Transformers官方仓库未支持的开源模型;如有需要,请手动配置--trust-remote-code参数。 ## 公网地址声明 diff --git a/configs/dangerous_shell.json b/configs/dangerous_shell.json new file mode 100644 index 0000000000000000000000000000000000000000..927219fe5391423e6dfca632613c3ba3aeaf0e8e --- /dev/null +++ b/configs/dangerous_shell.json @@ -0,0 +1,9 @@ +[ + "os\\.(system|popen|exec|setuid|setgid|chroot)\\s*\\(", + "subprocess\\.(run|Popen|call)\\s*\\(", + "pty\\.spawn\\s*\\(", + "(requests|urllib|socket|httpx)\\.(get|post|urlopen|connect)\\s*\\(", + "open\\s*\\(", + "os\\.(remove|rename|chmod|chown|mkdir)\\s*\\(", + "(eval|exec|__import__|globals|locals)\\s*\\(" +] \ No newline at end of file diff --git a/convert_ckpt.py b/convert_ckpt.py index 5cf14112b397b727c7fa23255fe01ddf3f01b852..dbd49ab194dc8a7824b30498f276b8ba726fa68f 100644 --- a/convert_ckpt.py +++ b/convert_ckpt.py @@ -79,6 +79,10 @@ def main(): parser.add_argument('--ckpt-format', default='torch', choices=['torch', 'torch_dist', 'zarr'], help='Checkpoint format to use.') + parser.add_argument('--trust-remote-code', + action='store_true', + default=False, + help='enable trust_remote_code when loading models with transformers') known_args, _ = parser.parse_known_args() diff --git a/convert_ckpt_v2.py b/convert_ckpt_v2.py index e227590f0c767b43fae777abc89b2e5305ee5be5..4dfe1f8032d411ba5e629b3b554ada7cea8e2eb2 100644 --- a/convert_ckpt_v2.py +++ b/convert_ckpt_v2.py @@ -48,6 +48,10 @@ def get_args(): help='Customizing the number of dense layers.') parser.add_argument('--num-layers', type=int, default=None, help='Specify the number of transformer layers to use.') + parser.add_argument('--trust-remote-code', + action='store_true', + default=False, + help='enable trust_remote_code for transformers from_pretrained') args, _ = parser.parse_known_args() return args diff --git a/evaluation.py b/evaluation.py index 11d096f0f920d84453fc31e92f6816448157120e..9a6467301e87028b2e8150531c2cfe853200e4b7 100644 --- a/evaluation.py +++ b/evaluation.py @@ -378,7 +378,7 @@ def main(): model_provider=model_provider, pretrained_model_name_or_path=args.load ) - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path, trust_remote_code=True, local_files_only=True) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path, trust_remote_code=args.trust_remote_code, local_files_only=True) rank = dist.get_rank() if 'cmmlu' in args.task: diff --git a/examples/mcore/deepseek3/convert_ckpt_deepseek3.py b/examples/mcore/deepseek3/convert_ckpt_deepseek3.py index e99ed193d8c594b6716ddb6da8ed30720fe94894..dfa15e27a9990739c2577c0f6cb7d77eb5b83417 100644 --- a/examples/mcore/deepseek3/convert_ckpt_deepseek3.py +++ b/examples/mcore/deepseek3/convert_ckpt_deepseek3.py @@ -12,7 +12,7 @@ import safetensors import torch import safetensors.torch import bitsandbytes as bnb - +from
mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger.basicConfig(format="") logger.getLogger().setLevel(logger.INFO) @@ -75,8 +75,8 @@ class CkptConvert(object): self.vpp_stage = vpp_stage if vpp_stage is not None: self.vpp_size = self.num_layers // self.pp_size // self.vpp_stage - self.hf_model_path = hf_model_path - self.mg_save_path = mg_save_path + self.hf_model_path = standardize_path(hf_model_path, check_read=True) + self.mg_save_path = standardize_path(mg_save_path, check_write=True) self.num_layer_list = num_layer_list self.noop_layers = noop_layers self.moe_grouped_gemm = moe_grouped_gemm @@ -140,7 +140,7 @@ class CkptConvert(object): """megatron model path""" iter_mg_path = os.path.join(mg_path, "iter_0000001") if not os.path.exists(mg_path): - os.makedirs(mg_path, exist_ok=True) + os.makedirs(mg_path, mode=0o750, exist_ok=True) with open(os.path.join(mg_path, "latest_checkpointed_iteration.txt"), 'w') as f: f.write("1") @@ -788,7 +788,7 @@ class CkptConvert(object): for tp_rank in range(self.tp_size): save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank) parallel_save_path = os.path.join(save_model_path, save_prefix) - os.makedirs(parallel_save_path) + os.makedirs(parallel_save_path, mode=0o750, exist_ok=True) save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt") logger.info(f"Saving to {save_file_name}") @@ -847,7 +847,7 @@ class CkptConvert(object): for tp_rank in range(self.tp_size): save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank) parallel_save_path = os.path.join(save_model_path, save_prefix) - os.makedirs(parallel_save_path, exist_ok=True) + os.makedirs(parallel_save_path, mode=0o750, exist_ok=True) save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt") logger.info(f"Saving to {save_file_name}") model_dict = {"checkpoint_version": 3.0, "iteration": 1} diff --git a/examples/mcore/deepseek3/convert_ckpt_deepseek3_mcore2hf.py b/examples/mcore/deepseek3/convert_ckpt_deepseek3_mcore2hf.py index 648814fb7ed9efdaae461a9c7f4dfb24108ab335..d2b9922c661c233788703002034189a8cbe3b564 100644 --- a/examples/mcore/deepseek3/convert_ckpt_deepseek3_mcore2hf.py +++ b/examples/mcore/deepseek3/convert_ckpt_deepseek3_mcore2hf.py @@ -14,7 +14,7 @@ import tqdm import torch import torch_npu import safetensors.torch - +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger.basicConfig(format="") logger.getLogger().setLevel(logger.INFO) @@ -73,15 +73,15 @@ class MgCkptConvert(object): self.ep_size = ep_size self.vpp_stage = vpp_stage - self.mg_model_path = mg_model_path - self.hf_save_path = hf_save_path + self.mg_model_path = standardize_path(mg_model_path, check_read=True) + self.hf_save_path = standardize_path(hf_save_path, check_write=True) self.lora_model_path = lora_model_path self.iter_path = self.get_iter_path(self.mg_model_path) if self.lora_model_path is not None: self.lora_iter_path = self.get_iter_path(self.lora_model_path) if not os.path.exists(self.hf_save_path): - os.makedirs(self.hf_save_path) + os.makedirs(self.hf_save_path, mode=0o750, exist_ok=True) self.num_layers = num_layers self.noop_layers = noop_layers @@ -194,7 +194,7 @@ class MgCkptConvert(object): directory = os.path.join(ckpt_path, f'iter_{iteration:07d}') - os.makedirs(directory, exist_ok=True) + os.makedirs(directory, mode=0o750, exist_ok=True) return directory diff --git a/examples/mindspore/deepseek3/convert_ckpt_deepseek3.py 
b/examples/mindspore/deepseek3/convert_ckpt_deepseek3.py index af47701c6ea042bf1a834ef8d2f1ebeb9723061a..32cf931d048d05532a8449a240c99155a5e3997c 100644 --- a/examples/mindspore/deepseek3/convert_ckpt_deepseek3.py +++ b/examples/mindspore/deepseek3/convert_ckpt_deepseek3.py @@ -146,7 +146,7 @@ class CkptConvert(object): """megatron model path""" iter_mg_path = os.path.join(mg_path, "iter_0000001") if not os.path.exists(mg_path): - os.makedirs(mg_path, exist_ok=True) + os.makedirs(mg_path, mode=0o750, exist_ok=True) with open(os.path.join(mg_path, "latest_checkpointed_iteration.txt"), 'w') as f: f.write("1") @@ -794,7 +794,7 @@ class CkptConvert(object): for tp_rank in range(self.tp_size): save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank) parallel_save_path = os.path.join(save_model_path, save_prefix) - os.makedirs(parallel_save_path) + os.makedirs(parallel_save_path, mode=0o750) save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt") logger.info(f"Saving to {save_file_name}") @@ -853,7 +853,7 @@ class CkptConvert(object): for tp_rank in range(self.tp_size): save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank) parallel_save_path = os.path.join(save_model_path, save_prefix) - os.makedirs(parallel_save_path, exist_ok=True) + os.makedirs(parallel_save_path, mode=0o750, exist_ok=True) save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt") logger.info(f"Saving to {save_file_name}") model_dict = {"checkpoint_version": 3.0, "iteration": 1} diff --git a/examples/mindspore/deepseek3/convert_ckpt_deepseek3_mcore2hf.py b/examples/mindspore/deepseek3/convert_ckpt_deepseek3_mcore2hf.py index bdd3a326bcf4978fc93708d32443c4a912079496..814e6cac29fc5b54e309655283a11bcbef0735f5 100644 --- a/examples/mindspore/deepseek3/convert_ckpt_deepseek3_mcore2hf.py +++ b/examples/mindspore/deepseek3/convert_ckpt_deepseek3_mcore2hf.py @@ -87,7 +87,7 @@ class MgCkptConvert(object): self.lora_iter_path = self.get_iter_path(self.lora_model_path) if not os.path.exists(self.hf_save_path): - os.makedirs(self.hf_save_path) + os.makedirs(self.hf_save_path, mode=0o750) self.num_layers = num_layers self.noop_layers = noop_layers @@ -200,7 +200,7 @@ class MgCkptConvert(object): directory = os.path.join(ckpt_path, f'iter_{iteration:07d}') - os.makedirs(directory, exist_ok=True) + os.makedirs(directory, mode=0o750, exist_ok=True) return directory diff --git a/mindspeed_llm/core/datasets/gpt_dataset.py b/mindspeed_llm/core/datasets/gpt_dataset.py index 63c59dba66b912733e7fcede160f62d923e33c37..1bf76fd937934c12384b3ba5ab1c44be202d8a5f 100644 --- a/mindspeed_llm/core/datasets/gpt_dataset.py +++ b/mindspeed_llm/core/datasets/gpt_dataset.py @@ -16,6 +16,7 @@ from megatron.core.datasets.gpt_dataset import (_build_document_index, _build_shuffle_index ) from mindspeed_llm.tasks.utils.error_utils import GPTDatasetSampleIndexError +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path from .blended_megatron_dataset_builder import need_to_build_dataset logger = logging.getLogger(__name__) @@ -70,6 +71,7 @@ def _build_document_sample_shuffle_indices( path_to_cache = os.path.join( self.dataset.path_prefix, "cache", f"{type(self).__name__}_indices" ) + path_to_cache = standardize_path(path_to_cache, check_write=True) # start of megatron_adaptation, # here we change from (class)GPTDataset._build_document_sample_shuffle_indices @@ -198,8 +200,7 @@ def _build_document_sample_shuffle_indices( ) if any(sample_index[:, 
0] < 0): - _url = "https://gitee.com/ascend/MindSpeed-LLM/wikis/megatron%20data%20helpers%E5%8F%AF%E8%83%BD%E5%BC%95%E5%85%A5%E7%9A%84%E9%97%AE%E9%A2%98" - raise GPTDatasetSampleIndexError(f"Bad sample index. Visit {_url} for more information") + raise GPTDatasetSampleIndexError(f"Bad sample index.") # Build the shuffle index if separate_final_epoch: @@ -212,7 +213,7 @@ def _build_document_sample_shuffle_indices( ) if path_to_cache: - os.makedirs(path_to_cache, exist_ok=True) + os.makedirs(path_to_cache, mode=0o750, exist_ok=True) # Write the description with open(path_to_description, "wt") as writer: writer.write(self.unique_description) @@ -258,8 +259,7 @@ def _build_document_sample_shuffle_indices( sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r') if any(sample_index[:, 0] < 0): - _url = "https://gitee.com/ascend/MindSpeed-LLM/wikis/megatron%20data%20helpers%E5%8F%AF%E8%83%BD%E5%BC%95%E5%85%A5%E7%9A%84%E9%97%AE%E9%A2%98" - raise GPTDatasetSampleIndexError(f"Bad sample index. Visit {_url} for more information") + raise GPTDatasetSampleIndexError(f"Bad sample index.") t_end = time.time() log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") @@ -345,10 +345,10 @@ def _get_ltor_masks_and_position_ids( i = eod_index[j] # Mask attention loss. if reset_attention_mask and attention_mask is not None: - attention_mask[0, (i + 1) :, : (i + 1)] = 0 + attention_mask[0, (i + 1):, :(i + 1)] = 0 # Reset positions. if reset_position_ids: - position_ids[(i + 1) :] -= i + 1 - prev_index + position_ids[(i + 1):] -= i + 1 - prev_index prev_index = i + 1 if attention_mask is not None: diff --git a/mindspeed_llm/core/distributed/finalize_model_grads.py b/mindspeed_llm/core/distributed/finalize_model_grads.py index 17a0c8605fdffb9c6c1c0ac2d243146e8a1e024e..4b7e41935b02b0c2e4417644021c347003a16cfa 100644 --- a/mindspeed_llm/core/distributed/finalize_model_grads.py +++ b/mindspeed_llm/core/distributed/finalize_model_grads.py @@ -36,12 +36,7 @@ def allreduce_layernorm_grads(model: List[torch.nn.Module], config: TransformerC for name, param in get_attr_wrapped_model(model_chunk, 'named_parameters')(): if not param.requires_grad: continue - if ( - param.requires_grad - and getattr(param, 'sequence_parallel', False) - or 'q_layernorm' in name - or 'k_layernorm' in name - ): + elif getattr(param, 'sequence_parallel', False) or 'q_layernorm' in name or 'k_layernorm' in name: grad = param.main_grad grads.append(grad.data) if grads: @@ -54,7 +49,7 @@ def allreduce_layernorm_grads(model: List[torch.nn.Module], config: TransformerC layer_norm_2d_grads = [] for model_chunk in model: - for name, param in get_attr_wrapped_model(model_chunk, "named_parameters")(): + for _, param in get_attr_wrapped_model(model_chunk, "named_parameters")(): if param.requires_grad and getattr(param, "2d_tp", False): layer_norm_2d_grad = param.main_grad layer_norm_2d_grads.append(layer_norm_2d_grad.data) diff --git a/mindspeed_llm/core/models/gpt/gpt_model.py b/mindspeed_llm/core/models/gpt/gpt_model.py index 070846b64f0f7e6ac77172ec464796e8b29526dd..196e7972311a4e95e1785ff56d06b09d35bab0b8 100644 --- a/mindspeed_llm/core/models/gpt/gpt_model.py +++ b/mindspeed_llm/core/models/gpt/gpt_model.py @@ -23,9 +23,8 @@ from megatron.core.utils import deprecate_inference_params from megatron.core.inference.contexts import BaseInferenceContext from megatron.training import get_args -from mindspeed_llm.core.tensor_parallel.layers import SegmentedColumnParallelLinear - from mindspeed.utils 
import get_actual_seq_len, compute_qkv_index, get_position_ids +from mindspeed_llm.core.tensor_parallel.layers import SegmentedColumnParallelLinear class GPTModel(MegatronCoreGPTModel): diff --git a/mindspeed_llm/core/parallel_state.py b/mindspeed_llm/core/parallel_state.py index 5a69b1cf913c0301728e3d217b8b092d4ee6a8c4..69d2c2c2b687a2bd16e16abecf5856bc40d6aeb3 100644 --- a/mindspeed_llm/core/parallel_state.py +++ b/mindspeed_llm/core/parallel_state.py @@ -26,6 +26,8 @@ from mindspeed.core.parallel_state import (initialize_context_parallel_group_for initialize_context_parallel_group_for_hybrid_cp, initialize_context_parallel_group_for_double_ring) +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path + _EXPERT_PARALLEL_GROUP = None _MPU_EXPERT_MODEL_PARALLEL_RANK = None _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = None @@ -58,6 +60,8 @@ def initialize_model_parallel_decorator(initialize_model_parallel): from megatron.training.utils import print_rank_0 timeout = timedelta(minutes=distributed_timeout_minutes) + nccl_communicator_config_path = standardize_path(nccl_communicator_config_path, check_read=True) + if pipeline_model_parallel_size == 2 and virtual_pipeline_model_parallel_size is not None: megatron.core.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0 megatron.core.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size diff --git a/mindspeed_llm/core/pipeline_parallel/dualpipe/adaptor.py b/mindspeed_llm/core/pipeline_parallel/dualpipe/adaptor.py index e6d9c09998520fad2036c35df27a77169b4ca35a..ad7b9f58bad7c650767a6db73b34a8378d5c8008 100644 --- a/mindspeed_llm/core/pipeline_parallel/dualpipe/adaptor.py +++ b/mindspeed_llm/core/pipeline_parallel/dualpipe/adaptor.py @@ -17,7 +17,7 @@ try: from mindspeed_llm.core.pipeline_parallel.dualpipe.gpt_model import gpt_model_forward_backward_overlaping from mindspeed_llm.core.pipeline_parallel.dualpipe.MTP_overlap import forward_overlap except ImportError: - pass + print("[warning] failed import dualpipe modules, not support dualpipe") def dualpipe_register_patches(MegatronAdaptation): diff --git a/mindspeed_llm/core/ssm/mamba_mixer.py b/mindspeed_llm/core/ssm/mamba_mixer.py index b2ab807a3baeef4e98270192efedddebfadc18d0..f710e3fa4767dcbe9472cec1ddca191a19e6a988 100644 --- a/mindspeed_llm/core/ssm/mamba_mixer.py +++ b/mindspeed_llm/core/ssm/mamba_mixer.py @@ -106,7 +106,7 @@ def mamba_mixer_forward(self, hidden_states, seqlen=None, seq_idx=None, cu_seqle # Compute short convolution if conv_state is not None: if cu_seqlens: - raise('Variable length inputs in convolution are not currently supported') + raise 'Variable length inputs in convolution are not currently supported' # If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise. 
conv_state.copy_( @@ -115,7 +115,7 @@ def mamba_mixer_forward(self, hidden_states, seqlen=None, seq_idx=None, cu_seqle seqlen = xBC.size(2) if seq_idx: - raise('Variable length inputs in convolution are not currently supported') + raise 'Variable length inputs in convolution are not currently supported' xBC = self.act(self.conv1d(xBC)[..., :seqlen]) # transpose b pd l --> b l pd diff --git a/mindspeed_llm/core/transformer/moe/layers.py b/mindspeed_llm/core/transformer/moe/layers.py index 35d942a4d3e0a0bb4ef1f46d0731f69e3b3f4e0e..cefa27e79e0cb2829611ac757de98ea2a161bff2 100644 --- a/mindspeed_llm/core/transformer/moe/layers.py +++ b/mindspeed_llm/core/transformer/moe/layers.py @@ -96,7 +96,7 @@ class SEColumnParallelLinear(megatron.core.tensor_parallel.ColumnParallelLinear) ) if self.config._cpu_offloading_context is not None: - if self.config._cpu_offloading_context.inside_context == True: + if self.config._cpu_offloading_context.inside_context: assert ( self.config.cpu_offloading == False ), "CPU Offloading cannot be enabled while using non-TE modules" @@ -197,7 +197,7 @@ class SERowParallelLinear(megatron.core.tensor_parallel.RowParallelLinear): """ if self.config._cpu_offloading_context is not None: - if self.config._cpu_offloading_context.inside_context == True: + if self.config._cpu_offloading_context.inside_context: assert ( self.config.cpu_offloading == False ), "CPU Offloading cannot be enabled while using non-TE modules" diff --git a/mindspeed_llm/core/transformer/moe/router.py b/mindspeed_llm/core/transformer/moe/router.py index 29c342448aac9ff2bc5954ac533b232d2ba24d09..4a919cfd5eca18e1b29bcc3404307c706dd36959 100644 --- a/mindspeed_llm/core/transformer/moe/router.py +++ b/mindspeed_llm/core/transformer/moe/router.py @@ -399,8 +399,10 @@ def apply_seq_aux_loss(self, activation, logits, topk_idx): scores_for_aux = scores # [s*b, n_global_experts] topk_idx_for_aux_loss = topk_idx.view(args.micro_batch_size, -1) # [b, s*top_k] scores_for_seq_aux = scores_for_aux.view(args.micro_batch_size, seq_length, -1) - ce = torch.stack([torch.histc(x.to(torch.int32), bins=args.num_experts, min=0, max=args.num_experts) for x in - topk_idx_for_aux_loss]) + ce = torch.stack([ + torch.histc(x.to(torch.int32), bins=args.num_experts, min=0, max=args.num_experts) + for x in topk_idx_for_aux_loss + ]) num_sub_sequence = 1 sequence_partition_group = parallel_state.get_context_parallel_group() diff --git a/mindspeed_llm/core/transformer/multi_token_prediction.py b/mindspeed_llm/core/transformer/multi_token_prediction.py index 194ce49b9e8db4e02ae4db3fa1479f1ee8cc9342..10939db33c23b1562a22a8ea8f79f9cf8f7d14f9 100644 --- a/mindspeed_llm/core/transformer/multi_token_prediction.py +++ b/mindspeed_llm/core/transformer/multi_token_prediction.py @@ -227,7 +227,7 @@ def mtp_block_forward( embedding.word_embeddings.weight = get_shared_embedding_from_dual_chunk() hidden_states_main_model = hidden_states - for layer_number in range(len(self.layers)): + for layer_number, _ in enumerate(self.layers): # get input_data from mtp_batch_list or not input_ids, position_ids, labels, loss_mask, attention_mask = get_mtp_layer_input( (input_ids, position_ids, labels, loss_mask, attention_mask), mtp_batch_list, layer_number) diff --git a/mindspeed_llm/features_manager/arguments/deprecated_args.py b/mindspeed_llm/features_manager/arguments/deprecated_args.py index 98cf8d52bff4293281533cda8e2fe321745f1dc3..618aefc5a6d3038ad1131d966de8931da150a7a9 100644 --- a/mindspeed_llm/features_manager/arguments/deprecated_args.py +++ 
b/mindspeed_llm/features_manager/arguments/deprecated_args.py @@ -50,6 +50,10 @@ class DeprecatedArgsFeature(MindSpeedFeature): group.add_argument('--rope-scaling-beta-slow', type=int, default=None, dest='deprecated_rope_scaling_beta_slow', help='Yarn rope: rope beta slow' 'Note: this option is deprecated, please use --beta-slow instead!') + group.add_argument('--trust-remote-code', + action='store_true', + default=False, + help='enable trust_remote_code for transformers from_pretrained') def validate_args(self, args): # If deprecated argument are used instead of new argument, we assign the deprecated argument to the new argument and issue a warning @@ -58,6 +62,11 @@ class DeprecatedArgsFeature(MindSpeedFeature): """The '--use-deter-comp' argument is deprecated and will be removed in the next future version, please use '--npu-deterministic' instead!""", DeprecationWarning) args.npu_deterministic = args.deprecated_use_deter_comp + if args.trust_remote_code: + print("""The '--trust-remote-code' argument is unsafe, please use it with caution!""") + else: + print( + """The '--trust-remote-code' argument is not set, some models may fail to load from transformers.""") if args.deprecated_use_mc2 and not args.use_ascend_mc2: warnings.warn( """The '--use-mc2' argument is deprecated and will be removed in the next future version, diff --git a/mindspeed_llm/legacy/model/transformer.py b/mindspeed_llm/legacy/model/transformer.py index fcefd0b2de7296feb07047c12ee16ae802994497..55ac368a8ab6c72daa1bcb50862b3879b44535a6 100644 --- a/mindspeed_llm/legacy/model/transformer.py +++ b/mindspeed_llm/legacy/model/transformer.py @@ -487,16 +487,16 @@ class FlashSelfAttention(torch.nn.Module): if q.shape[1] == 1 and q.shape[1] != seq_length: output = torch_npu.npu_incre_flash_attention( \ q, k, v, \ - num_heads=head_num, + num_heads=head_num, input_layout="BSH", \ pse_shift=pse, \ padding_mask=None, \ - scale_value=scale, + scale_value=scale, ) else: output = torch_npu.npu_prompt_flash_attention( \ q, k, v, \ - num_heads=head_num, + num_heads=head_num, input_layout="BSH", \ pse_shift=pse, \ sparse_mode=sparse_mode, \ @@ -520,7 +520,7 @@ class FlashSelfAttention(torch.nn.Module): keep_prob=1 - self.dropout_p, \ inner_precise=0 )[0] - + return output @@ -763,7 +763,7 @@ def ParallelAttentionForward(self, hidden_states, attention_mask, # In inference, we compute one token at a time. # Select the correct positional embedding # (only the last token in the sequence) - q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end] + q_pos_emb = q_pos_emb[sequence_end - 1: sequence_end] else: # In the first forward pass of inference, # we use the entire provided prefix.
diff --git a/mindspeed_llm/tasks/checkpoint/convert.py b/mindspeed_llm/tasks/checkpoint/convert.py index 0f054757fa21e0160182c1ea69a7b35dd758c99d..1ebb44727869121c8e5b4f6611fbdcf3e3703e63 100644 --- a/mindspeed_llm/tasks/checkpoint/convert.py +++ b/mindspeed_llm/tasks/checkpoint/convert.py @@ -44,7 +44,7 @@ class Convert(abc.ABC): """megatron model path""" iter_mg_path = os.path.join(mg_path, "iter_0000001") if not os.path.exists(mg_path): - os.makedirs(mg_path, exist_ok=True) + os.makedirs(mg_path, mode=0o750, exist_ok=True) with open(os.path.join(mg_path, "latest_checkpointed_iteration.txt"), 'w') as f: f.write("1") return iter_mg_path diff --git a/mindspeed_llm/tasks/checkpoint/convert_ckpt_mamba2.py b/mindspeed_llm/tasks/checkpoint/convert_ckpt_mamba2.py index a889b86a1c3995a0de1f035bf4d91016eb048e49..7346deed43f0c362d0b92cabdb9f775ba07cbd40 100644 --- a/mindspeed_llm/tasks/checkpoint/convert_ckpt_mamba2.py +++ b/mindspeed_llm/tasks/checkpoint/convert_ckpt_mamba2.py @@ -8,6 +8,7 @@ import logging as logger import argparse import torch import safetensors.torch +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger.basicConfig(format="") logger.getLogger().setLevel(logger.INFO) @@ -67,7 +68,7 @@ class CheckpointConverter: try: if filename.endswith(".bin"): - cur_weights = torch.load(file_path, map_location=torch.device('cpu')) + cur_weights = torch.load(file_path, map_location=torch.device('cpu'), weights_only=False) model_dict.update(cur_weights) print(f"Successfully loaded: {filename}") loaded = True @@ -475,7 +476,7 @@ class CheckpointConverter: dir_name += f"_{pp_idx:03d}" save_path = os.path.join(args.save_dir, f"iter_{out_iteration:07d}", dir_name) - os.makedirs(save_path, exist_ok=True) + os.makedirs(save_path, mode=0o750, exist_ok=True) return os.path.join(save_path, filename) @@ -589,6 +590,8 @@ def run(): args, _ = parser.parse_known_args() + args.load_dir = standardize_path(args.load_dir, check_read=True) + converter = CheckpointConverter(args) converter.main() diff --git a/mindspeed_llm/tasks/checkpoint/convert_hf2mg.py b/mindspeed_llm/tasks/checkpoint/convert_hf2mg.py index b385a78d18b7e10a2dbb0f99142e9e82e4c5d290..1ee6f67d69a069d9c06e30b4bf2c521d5b217259 100644 --- a/mindspeed_llm/tasks/checkpoint/convert_hf2mg.py +++ b/mindspeed_llm/tasks/checkpoint/convert_hf2mg.py @@ -922,7 +922,7 @@ class Hf2MgConvert(Convert): continue save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank) parallel_save_path = os.path.join(self.save_dir, save_prefix) - os.makedirs(parallel_save_path, exist_ok=True) + os.makedirs(parallel_save_path, mode=0o750, exist_ok=True) save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt") logger.info(f"Saving to {save_file_name}") @@ -984,7 +984,7 @@ class Hf2MgConvert(Convert): continue save_prefix = self.generate_mg_weights_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank) parallel_save_path = os.path.join(self.save_dir, save_prefix) - os.makedirs(parallel_save_path, exist_ok=True) + os.makedirs(parallel_save_path, mode=0o750, exist_ok=True) save_file_name = os.path.join(parallel_save_path, "model_optim_rng.pt") logger.info(f"Saving to {save_file_name}") model_dict = {"args" : args, "checkpoint_version" : 3.0, "iteration" : 1} diff --git a/mindspeed_llm/tasks/checkpoint/convert_mg2hf.py b/mindspeed_llm/tasks/checkpoint/convert_mg2hf.py index 37d296aa1de7cc1c6afb80f2460f8104e09135d5..4f4e94460bb727aedd0848553f98ad3bbd16578e 100644 --- 
a/mindspeed_llm/tasks/checkpoint/convert_mg2hf.py +++ b/mindspeed_llm/tasks/checkpoint/convert_mg2hf.py @@ -46,7 +46,7 @@ class Mg2HfConvert(Convert): if not os.path.exists(self.save_dir): - os.makedirs(self.save_dir) + os.makedirs(self.save_dir, mode=0o750) self.tensor_model_parallel_size = self.load_model.tensor_model_parallel_size self.pipeline_model_parallel_size = self.load_model.pipeline_model_parallel_size @@ -126,7 +126,7 @@ class Mg2HfConvert(Convert): directory = os.path.join(ckpt_path, f'iter_{iteration:07d}') - os.makedirs(directory, exist_ok=True) + os.makedirs(directory, mode=0o750, exist_ok=True) return directory diff --git a/mindspeed_llm/tasks/checkpoint/convert_param.py b/mindspeed_llm/tasks/checkpoint/convert_param.py index c3ef16bd562e25919c360bfd9e3129a5791507fb..9158525ba46fb0540a05fd3d92cdefc9ffb6c677 100644 --- a/mindspeed_llm/tasks/checkpoint/convert_param.py +++ b/mindspeed_llm/tasks/checkpoint/convert_param.py @@ -20,6 +20,7 @@ import stat import time import torch +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path def get_json_from_file(json_file): @@ -123,10 +124,11 @@ class ConvertBase: self.mg_latest_ckpt_file_name = "latest_checkpointed_iteration.txt" # hf model index_file - self.model_index_file = os.path.join( - self.args_cmd.hf_dir, - "pytorch_model.bin.index.json") if self.args_cmd.model_index_file is None \ + index_file = os.path.join(self.args_cmd.hf_dir, "pytorch_model.bin.index.json") + self.model_index_file = index_file if self.args_cmd.model_index_file is None \ else self.args_cmd.model_index_file + self.model_index_file = standardize_path(self.model_index_file, check_read=True) + self.model_index_map = get_json_from_file(self.model_index_file) # hf model config_file self.config_file = os.path.join( @@ -550,17 +552,18 @@ class ConvertHf2Mg(ConvertBase): ep_rank=ep_rank) save_dir = self.get_mg_model_save_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=ep_rank, iteration=iteration) - os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir, mode=0o750, exist_ok=True) torch.save(model_dict, os.path.join(save_dir, self.mg_model_file_name)) else: # Dense Model model_dict = self._set_dense_mg_model(hf_model=hf_model, tp_rank=tp_rank, pp_rank=pp_rank) save_dir = self.get_mg_model_save_dir(tp_rank=tp_rank, pp_rank=pp_rank, ep_rank=None, iteration=iteration) - os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir, mode=0o750, exist_ok=True) torch.save(model_dict, os.path.join(save_dir, self.mg_model_file_name)) # write latest_checkpointed_iteration.txt latest_ckpt_file_path = os.path.join(self.args_cmd.mg_dir, self.mg_latest_ckpt_file_name) + latest_ckpt_file_path = standardize_path(latest_ckpt_file_path, check_write=True) modes = stat.S_IWUSR | stat.S_IRUSR | stat.S_IWGRP | stat.S_IRGRP with os.fdopen(os.open(latest_ckpt_file_path, flags=os.O_RDWR | os.O_CREAT, mode=modes), 'w') as fout: fout.write(iteration) @@ -757,7 +760,7 @@ class ConvertMg2Hf(ConvertBase): if self.get_hf_model_file_based_param_key(param_key) == model_file: exist_model[param_key] = hf_model[param_key] - os.makedirs(os.path.dirname(file_path), exist_ok=True) + os.makedirs(os.path.dirname(file_path), mode=0o750, exist_ok=True) torch.save(exist_model, file_path) def run(self): diff --git a/mindspeed_llm/tasks/checkpoint/loader_hf.py b/mindspeed_llm/tasks/checkpoint/loader_hf.py index 0d6e50fcacfb1966bfd8a95d15760e9b52bf954e..cd84e11d743bd6a43e3d666d9f839ba62a069640 100644 --- a/mindspeed_llm/tasks/checkpoint/loader_hf.py +++ 
b/mindspeed_llm/tasks/checkpoint/loader_hf.py @@ -376,7 +376,7 @@ def _load_checkpoint(model_provider, queue, args): md = build_metadata(args, margs) queue.put(md) - model_hf.get_modules_from_pretrained() + model_hf.get_modules_from_pretrained(trust_remote_code=args.trust_remote_code) model_mg.get_modules_from_config() model_mg.update_module(model_hf) diff --git a/mindspeed_llm/tasks/checkpoint/models.py b/mindspeed_llm/tasks/checkpoint/models.py index efcdc498b78c3a5e0a93b58d9dba22885292e5ba..69f0ae9d793ae4280897cafeef4c9d000370ffab 100644 --- a/mindspeed_llm/tasks/checkpoint/models.py +++ b/mindspeed_llm/tasks/checkpoint/models.py @@ -2,6 +2,7 @@ import abc import os import sys +import ast import re import json from types import SimpleNamespace @@ -23,6 +24,7 @@ from megatron.core import tensor_parallel from mindspeed_llm.training.utils import parse_args from mindspeed_llm.training.training import model_provider_func_wrapper from mindspeed_llm.training.checkpointing import load_checkpoint_wrapper +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger.basicConfig(format="") logger.getLogger().setLevel(logger.INFO) @@ -468,8 +470,10 @@ class HuggingfaceModel(ModelBase): # Read huggingface args. if self.args_cmd.save_model_type == 'hf': cfg_dir = self.args_cmd.save_dir + cfg_dir = standardize_path(self.args_cmd.save_dir, check_write=True) else: cfg_dir = self.args_cmd.load_dir + cfg_dir = standardize_path(self.args_cmd.load_dir, check_read=True) llama_args_path = os.path.join(cfg_dir, "config.json") with open(llama_args_path) as f: self.args = json.load(f) @@ -503,7 +507,7 @@ class HuggingfaceModel(ModelBase): self.args.save_lora_to_hf = self.args_cmd.save_lora_to_hf self.args.noop_layers = self.args_cmd.noop_layers - def get_modules_from_config(self, device_map="cpu", trust_remote_code=True): + def get_modules_from_config(self, device_map="cpu", trust_remote_code=False): # Load Huggingface model. if self.args_cmd.save_model_type == "hf": load_dir = self.args_cmd.save_dir @@ -515,9 +519,9 @@ class HuggingfaceModel(ModelBase): hf_model.to_empty(device=device_map) self.module = [hf_model] if hasattr(self.args, "torch_dtype") and self.args.torch_dtype in ["float16", "bfloat16"]: - self.module[0] = self.module[0].to(eval(f'torch.{self.args.torch_dtype}')) + self.module[0] = self.module[0].to(getattr(torch, self.args.torch_dtype)) - def get_modules_from_pretrained(self, device_map="cpu", trust_remote_code=True): + def get_modules_from_pretrained(self, device_map="cpu", trust_remote_code=False): # Load Huggingface model.
if self.args_cmd.save_model_type == "hf": load_dir = self.args_cmd.save_dir @@ -544,7 +548,8 @@ class HuggingfaceModel(ModelBase): ) self.module = [get_peft_model(self.module[0], lora_config)] if hasattr(self.args, "torch_dtype") and self.args.torch_dtype in ["float16", "bfloat16"]: - self.module[0] = self.module[0].to(eval(f'torch.{self.args.torch_dtype}')) + dtype = getattr(torch, self.args.torch_dtype) + self.module[0] = self.module[0].to(dtype) def get_lora_key(self, layer_name, prefix): return f"{layer_name}.{prefix}" diff --git a/mindspeed_llm/tasks/checkpoint/saver.py b/mindspeed_llm/tasks/checkpoint/saver.py index 8d23ad83c38f3a811ed53994247d448c994a7d9d..d6dbabe1354ba2807936a7abe1b8e3cda80cab85 100644 --- a/mindspeed_llm/tasks/checkpoint/saver.py +++ b/mindspeed_llm/tasks/checkpoint/saver.py @@ -542,9 +542,9 @@ def save_huggingface(args, model): from .models import get_huggingface_model model_hf = get_huggingface_model(args) if args.load_hf_from_config: - model_hf.get_modules_from_config() + model_hf.get_modules_from_config(trust_remote_code=args.trust_remote_code) else: - model_hf.get_modules_from_pretrained() + model_hf.get_modules_from_pretrained(trust_remote_code=args.trust_remote_code) args_cmd = model_hf.get_args_cmd() model_hf.update_module(model) diff --git a/mindspeed_llm/tasks/evaluation/eval_impl/agi_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/agi_eval.py index 31beea4ae66512ac2d30205f6d2c968c24f54fe6..b75d5039f7fdb3e0b04c3449c45e59308abf5256 100644 --- a/mindspeed_llm/tasks/evaluation/eval_impl/agi_eval.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/agi_eval.py @@ -34,7 +34,7 @@ from mindspeed_llm.tasks.evaluation.eval_utils.agi_utils import ( get_default_instruction, get_pred_postprocess_func ) - +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger = logging.getLogger(__name__) @@ -42,7 +42,7 @@ class AGIEvalExam(DatasetEval): def __init__(self, test_dir, eval_args, instruction_template="{fewshot_template} {question}\n{question_template}\n{options}" "\n{answer_template}"): - self.test_dir = test_dir + self.test_dir = standardize_path(test_dir, check_read=True) self.instruction_template = instruction_template self.batch_size = eval_args.evaluation_batch_size self.rank = dist.get_rank() diff --git a/mindspeed_llm/tasks/evaluation/eval_impl/bbh_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/bbh_eval.py index 6f5cc3e3546ca6577f07516b92b70e6a60d89969..fe9d32aa264a2531a8a7be2298d5e754cdcbb025 100644 --- a/mindspeed_llm/tasks/evaluation/eval_impl/bbh_eval.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/bbh_eval.py @@ -32,7 +32,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.dataset_eval import DatasetEval from mindspeed_llm.tasks.evaluation.eval_impl.template import BBH_TEMPLATE_DIR, BBH_COT_TEMPLATE_DIR, get_eval_template from mindspeed_llm.tasks.evaluation.eval_utils.bbh_utils import bbh_mcq_postprocess, bbh_freeform_postprocess, bbh_true_or_false_questions from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset - +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger = logging.getLogger(__name__) @@ -75,7 +75,7 @@ bbh_free_form_sets = [ class BBHEval(DatasetEval): def __init__(self, test_dir, eval_args, instruction_template="{fewshot_template}Q: {question}\nA:"): - self.test_dir = test_dir + self.test_dir = standardize_path(test_dir, check_read=True) self.instruction_template = instruction_template self.batch_size = eval_args.evaluation_batch_size self.rank = dist.get_rank() @@ -169,7 
+169,7 @@ class BBHEval(DatasetEval): result_mapping = {value.strip(): key for key, value in re.findall(r'\(([A-Z])\)\s*([^\(\)]+)', instruction[-1][:answer_idx])} elif args.chain_of_thought: instruction = bbh_template.get(subject_name) - target_question = "Q: " + item['input'] + target_question = "Q: " + item['input'] # item['input'] is not path-info instruction += target_question instruction += "\nA: Let's think step by step." instructions.append(instruction) diff --git a/mindspeed_llm/tasks/evaluation/eval_impl/ceval_exam.py b/mindspeed_llm/tasks/evaluation/eval_impl/ceval_exam.py index 847f7b3b19768aede95b50ed4328b5e2f1fad329..44fa435f7e8e50da747018b9bbdc7c266d7958c3 100644 --- a/mindspeed_llm/tasks/evaluation/eval_impl/ceval_exam.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/ceval_exam.py @@ -31,7 +31,7 @@ from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero from mindspeed_llm.tasks.evaluation.eval_utils.ceval_utils import format_ceval_templates, first_capital_postprocess from mindspeed_llm.tasks.evaluation.utils import get_final_dataset from mindspeed_llm.tasks.evaluation.eval_impl.template import CEVAL_TEMPLATE_DIR, get_eval_template - +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger = logging.getLogger(__name__) @@ -39,7 +39,7 @@ logger = logging.getLogger(__name__) class CEvalExam(DatasetEval): def __init__(self, test_dir, eval_args, instruction_template="{fewshot_template}\n\n问:{question}\n答:"): - self.test_dir = test_dir + self.test_dir = standardize_path(test_dir, check_read=True) self.instruction_template = instruction_template self.batch_size = eval_args.evaluation_batch_size self.rank = dist.get_rank() diff --git a/mindspeed_llm/tasks/evaluation/eval_impl/cmmlu_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/cmmlu_eval.py index 480fc1e3ea6b6303edc802ac34c992b5861bea13..542e124974692e894fbe4e1a3ab4254d8eaa256d 100644 --- a/mindspeed_llm/tasks/evaluation/eval_impl/cmmlu_eval.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/cmmlu_eval.py @@ -31,6 +31,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero from mindspeed_llm.tasks.evaluation.eval_utils.cmmlu_utils import cmmlu_subject_mapping, first_option_postprocess, cmmlu_format_example from mindspeed_llm.tasks.evaluation.utils import get_final_dataset +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path from .template import CMMLU_TEMPLATE_DIR, get_eval_template @@ -43,7 +44,7 @@ class CmmluEval(DatasetEval): "{question}\n答案: ", output_template1=r".*(?P<答案>[A|B|C|D])\..*", output_template2=r"(?P<答案>[A|B|C|D])"): - self.test_dir = test_dir + self.test_dir = standardize_path(test_dir, check_read=True) self.instruction_template = instruction_template self.output_template = [output_template1, output_template2] self.batch_size = eval_args.evaluation_batch_size diff --git a/mindspeed_llm/tasks/evaluation/eval_impl/gsm8k_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/gsm8k_eval.py index 6417dffd14324489c7f1c0962e7fae45f64d5f85..0277bf37e01463eaadcc2030b2aa695bb5cbe1d8 100644 --- a/mindspeed_llm/tasks/evaluation/eval_impl/gsm8k_eval.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/gsm8k_eval.py @@ -30,6 +30,7 @@ from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero from mindspeed_llm.tasks.evaluation.eval_utils.gsm8k_utils import four_shots_prompt, gsm8k_postprocess from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset from 
mindspeed_llm.tasks.evaluation.eval_impl.template import GSM8K_TEMPLATE_DIR +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger = logging.getLogger(__name__) @@ -38,7 +39,7 @@ class Gsm8kEval(DatasetEval): def __init__(self, test_dir, eval_args, instruction_template="{fewshot_template}\n\n{question}", output_template=r'The answer is (.*?) '): - self.test_dir = test_dir + self.test_dir = standardize_path(test_dir, check_read=True) self.instruction_template = instruction_template self.output_template = output_template self.batch_size = eval_args.evaluation_batch_size diff --git a/mindspeed_llm/tasks/evaluation/eval_impl/hellaswag_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/hellaswag_eval.py index 6dff8f53a208652e6bb8da791529310bdc7f0a84..e01fa0e8f43ed7ae01b5f51946dbf4672b84a038 100644 --- a/mindspeed_llm/tasks/evaluation/eval_impl/hellaswag_eval.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/hellaswag_eval.py @@ -18,7 +18,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero from mindspeed_llm.tasks.evaluation.eval_utils.mmlu_utils import postprocess from mindspeed_llm.tasks.evaluation.utils import get_final_list_dataset - +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger = logging.getLogger(__name__) @@ -27,7 +27,7 @@ class HellaswagEval(DatasetEval): def __init__(self, test_dir, eval_args, output_template1=r".*(?P[A|B|C|D])\..*", output_template2=r"(?P[A|B|C|D])"): - self.test_dir = test_dir + self.test_dir = standardize_path(test_dir, check_read=True) self.output_template = [output_template1, output_template2] self.instruction_template = ('{ctx}\nQuestion: Which ending makes the most sense?\n' 'A. {A}\nB. {B}\nC. {C}\nD. 
{D}\n' diff --git a/mindspeed_llm/tasks/evaluation/eval_impl/human_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/human_eval.py index ef5a20397262862072570115be5e84e605fd08e3..0f6bf2044e617a9eac6e66d7f299243b2344c44f 100644 --- a/mindspeed_llm/tasks/evaluation/eval_impl/human_eval.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/human_eval.py @@ -18,6 +18,7 @@ import os import logging import re import sys +import ast import subprocess from typing import Iterable, Dict import pandas as pd @@ -33,16 +34,55 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero from mindspeed_llm.training.utils import WRITE_FILE_DEFAULT_FLAGS, WRITE_FILE_DEFAULT_MODES from mindspeed_llm.tasks.evaluation.eval_utils.human_utils import humaneval_postprocess, get_score - +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path logger = logging.getLogger(__name__) +def is_code_dangerous(code: str, dangerous_patterns) -> bool: + """Detect privilege escalation, outbound connections and file tampering via regex and AST checks.""" + + # Regex screening (fast filter) + for pattern in dangerous_patterns: + if re.search(pattern, code): + return True + + # AST analysis (harder to bypass) + try: + tree = ast.parse(code) + for node in ast.walk(tree): + if isinstance(node, ast.Call): + if isinstance(node.func, ast.Name): + if node.func.id in ("exec", "eval", "open", "os", "subprocess"): + return True + elif isinstance(node, (ast.Import, ast.ImportFrom)): + for alias in node.names: + if alias.name in ("os", "sys", "subprocess"): + return True + + # Detect calls such as os.system("sudo ...") + if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute) and node.func.attr == "system": + return True + if isinstance(node, ast.Call) and any(isinstance(arg, ast.Constant) and isinstance(arg.value, str) and ("sudo" in arg.value or "curl" in arg.value) for arg in node.args): + return True + # Detect dynamic imports such as __import__("os").system(...) + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "__import__": + return True + + return False + except SyntaxError: + return True # treat unparseable code as dangerous + + def extract_answer_code(answer, task: dict): """ :param answer: :param task: :return: """ + + with open("configs/dangerous_shell.json", "r", encoding="utf-8") as f: + dangerous_patterns = json.load(f) + if is_code_dangerous(answer, dangerous_patterns) or is_code_dangerous(task["test"], dangerous_patterns): + raise ValueError("Unsafe code detected") + task_id = task['task_id'] target_func = task['entry_point'] test_case = task['test'] @@ -51,7 +91,7 @@ def extract_answer_code(answer, task: dict): code_lines = code.split("\n") target_func_flag = False if not os.path.exists(CODE_TEST_LOG_DIR): - os.makedirs(CODE_TEST_LOG_DIR) + os.makedirs(CODE_TEST_LOG_DIR, mode=0o750, exist_ok=True) test_code_path = "{}/{}".format(CODE_TEST_LOG_DIR, save_file) with os.fdopen(os.open(test_code_path, WRITE_FILE_DEFAULT_FLAGS, WRITE_FILE_DEFAULT_MODES), 'w') as f: f.write("from typing import List\n") @@ -85,7 +125,7 @@ def extract_answer_code(answer, task: dict): class HumanEval(DatasetEval): def __init__(self, test_dir, eval_args): - self.test_dir = test_dir + self.test_dir = standardize_path(test_dir, check_read=True) instruction_template = eval_args.instruction_template if instruction_template: self.instruction_template = instruction_template @@ -96,6 +136,10 @@ class HumanEval(DatasetEval): self.file_pbar = None self.task_pbar = None self.prompt = 'Complete the following python code:\n{prompt}' + with open("configs/dangerous_shell.json", "r", encoding="utf-8") as f: + self.dangerous_patterns = json.load(f) def read_problems(self) -> Dict[str, Dict]: return
{task["task_id"]: task for task in self.stream_jsonl(self.test_dir)} diff --git a/mindspeed_llm/tasks/evaluation/eval_impl/mmlu_eval.py b/mindspeed_llm/tasks/evaluation/eval_impl/mmlu_eval.py index 05867319d42814918f40d34f332a0dfb746710a8..4013a7957c0f53726dd2c1f8e8ccb9c944886fd1 100644 --- a/mindspeed_llm/tasks/evaluation/eval_impl/mmlu_eval.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/mmlu_eval.py @@ -30,6 +30,7 @@ from mindspeed_llm.tasks.evaluation.eval_api.chat import Chat from mindspeed_llm.tasks.utils.error_utils import check_divisible_by_zero from mindspeed_llm.tasks.evaluation.eval_utils.mmlu_utils import _format_example, postprocess from mindspeed_llm.tasks.evaluation.utils import get_final_dataset +from mindspeed_llm.tasks.evaluation.file_utils import standardize_path from .template import MMLU_TEMPLATE_DIR, get_eval_template @@ -42,7 +43,7 @@ class MmluEval(DatasetEval): "{question}\nAnswer:", output_template1=r".*(?P[A|B|C|D])\..*", output_template2=r"(?P[A|B|C|D])"): - self.test_dir = test_dir + self.test_dir = standardize_path(test_dir, check_read=True) self.instruction_template = instruction_template self.output_template = [output_template1, output_template2] self.batch_size = eval_args.evaluation_batch_size diff --git a/mindspeed_llm/tasks/evaluation/eval_impl/needlebench_single.py b/mindspeed_llm/tasks/evaluation/eval_impl/needlebench_single.py index 22a923c79eca547b4048039ad348429d62290dce..22baf1d08c2c2a91948ad18a4a0c2b527c9e1a97 100644 --- a/mindspeed_llm/tasks/evaluation/eval_impl/needlebench_single.py +++ b/mindspeed_llm/tasks/evaluation/eval_impl/needlebench_single.py @@ -157,7 +157,7 @@ def load_datasets(config: NeedleBenchConfig): dataset = [] needle_file_path = os.path.join(config.dataset_dir, "needles.jsonl") cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache") - os.makedirs(cache_dir, exist_ok=True) + os.makedirs(cache_dir, mode=0o750, exist_ok=True) cache_file = "9b5ad71b2ce5302211f9c61530b329a4922fc6a4" dest_filepath = os.path.join(cache_dir, cache_file) # 参考docs中的needlebench-evaluation.md下载cl100k_base.tiktoken,将其放置到该数据集文件夹 diff --git a/mindspeed_llm/tasks/evaluation/eval_utils/human_utils.py b/mindspeed_llm/tasks/evaluation/eval_utils/human_utils.py index 4b29bd2c73e942bc06de06273e69ea6044c15a98..3872dac41453fa9500f816c836fa6796519d89db 100644 --- a/mindspeed_llm/tasks/evaluation/eval_utils/human_utils.py +++ b/mindspeed_llm/tasks/evaluation/eval_utils/human_utils.py @@ -70,7 +70,7 @@ def check_correctness(problem: Dict, completion: str, timeout: float, rmtree = shutil.rmtree rmdir = os.rmdir - chdir = os.chdir + os_chdir = os.chdir reliability_guard() @@ -80,8 +80,8 @@ def check_correctness(problem: Dict, completion: str, timeout: float, f"check({problem['entry_point']})" ) + exec_globals = {} try: - exec_globals = {} with swallow_io(): with time_limit(timeout): exec(check_program, exec_globals) @@ -93,7 +93,7 @@ def check_correctness(problem: Dict, completion: str, timeout: float, shutil.rmtree = rmtree os.rmdir = rmdir - os.chdir = chdir + os.chdir = os_chdir manager = multiprocessing.Manager() result = manager.list() @@ -211,7 +211,7 @@ def get_score(predictions, references, test_set, problem_set): def evaluate_functional_correctness( sample_file: str, - problem_file: dict, + problem_file: dict = None, k: List[int] = [1, 10, 100], n_workers: int = 4, timeout: float = 3.0, @@ -220,6 +220,8 @@ def evaluate_functional_correctness( Evaluates the functional correctness of generated samples, and writes results to 
f"{sample_file}_results.jsonl.gz" """ + if problem_file is None: + problem_file = {} problems = problem_file @@ -259,7 +261,8 @@ def evaluate_functional_correctness( ks = k pass_at_k = { f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() - for k in ks if (total >= k).all() + for k in ks + if (total >= k).all() } # Finally, save the results in one file: diff --git a/mindspeed_llm/tasks/evaluation/file_utils.py b/mindspeed_llm/tasks/evaluation/file_utils.py index 6e52342fddc42a7fec706bf174105e7ff1c2909a..6f7032c2a7ceccff175f84fbe9d578b2472d78bb 100644 --- a/mindspeed_llm/tasks/evaluation/file_utils.py +++ b/mindspeed_llm/tasks/evaluation/file_utils.py @@ -2,6 +2,7 @@ from functools import reduce import os import stat +import re MAX_PATH_LENGTH = 4096 MAX_FILE_SIZE = 100 * 1024 * 1024 @@ -52,17 +53,36 @@ def safe_open(file_path: str, mode='r', encoding=None, permission_mode=0o600, is mode, encoding=encoding) -def standardize_path(path: str, max_path_length=MAX_PATH_LENGTH, check_link=True): +def standardize_path(path: str, max_path_length=MAX_PATH_LENGTH, check_link=True, check_read=True, check_write=True): """ check path param: path return: data real path after check """ check_path_is_none(path) + if path: + path = os.path.realpath(path) + else: + return None + + if os.path.exists(path): + if check_read and not os.access(path, os.R_OK): + raise RuntimeError(f"File {path} not readable") + + if check_write and not os.access(path, os.W_OK): + raise RuntimeError(f"File {path} not writable") + else: + print(f"Path: {path} not exists") + check_path_length_lt(path, max_path_length) if check_link: check_path_is_link(path) path = os.path.realpath(path) + + pattern = r'(\.|/|_|-|\s|[~0-9a-zA-Z]|[\u4e00-\u9fa5])+' + if not re.fullmatch(pattern, path): + raise RuntimeError(f"Invalid input path: {path}") + return path diff --git a/mindspeed_llm/tasks/models/spec/qwen3_spec.py b/mindspeed_llm/tasks/models/spec/qwen3_spec.py index 925eb72b3493e5ab92e5b0b6ecadf87fa0fd3a5e..60568873ccfd014834b3e980f615d1e35e5998f2 100644 --- a/mindspeed_llm/tasks/models/spec/qwen3_spec.py +++ b/mindspeed_llm/tasks/models/spec/qwen3_spec.py @@ -7,9 +7,10 @@ from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec from megatron.training import get_args +from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules from mindspeed_llm.core.transformer.custom_layers.transformer_engine import PTNorm -from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules + args = get_args() num_experts, moe_grouped_gemm, qk_layernorm = args.num_experts, args.moe_grouped_gemm, args.qk_layernorm diff --git a/mindspeed_llm/tasks/posttrain/rejection_sampling/rejection_sampling.py b/mindspeed_llm/tasks/posttrain/rejection_sampling/rejection_sampling.py index 8f07e6edda8859d174cda383f491f12d9cf79f31..0343f857fcf782ad9f4a6306a64c20ea97bf2dcc 100644 --- a/mindspeed_llm/tasks/posttrain/rejection_sampling/rejection_sampling.py +++ b/mindspeed_llm/tasks/posttrain/rejection_sampling/rejection_sampling.py @@ -2,7 +2,7 @@ import argparse import gc import json import re - +import os import jsonlines import pandas as pd import torch @@ -13,6 +13,7 @@ from vllm.distributed.parallel_state import (destroy_distributed_environment, de from utils import blending_datasets, PromptGtAnswerDataset, apply_GenRM_template, rejection_sampling_processor from 
mindspeed_llm.tasks.posttrain.verifier.rule_verifier import preprocess_box_response_for_qwen_prompt
+from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
 
 
 def clean_up():
@@ -36,13 +37,13 @@ def batch_generate_vllm(args):
     dummy_strategy.args = args
 
     # configure tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(args.pretrain, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrain, trust_remote_code=args.trust_remote_code)
 
     # configure model
     llm = LLM(
         model=args.pretrain,
         tensor_parallel_size=args.tp_size,
-        trust_remote_code=True,
+        trust_remote_code=args.trust_remote_code,
         seed=args.seed,
         max_num_seqs=args.max_num_seqs,
         enable_prefix_caching=args.enable_prefix_caching,
@@ -107,7 +108,7 @@ def batch_GenRM_rejection_sampling(args):
     llm = LLM(
         model=args.pretrain,
         tensor_parallel_size=args.tp_size,
-        trust_remote_code=True,
+        trust_remote_code=args.trust_remote_code,
         seed=args.seed,
         max_num_seqs=args.max_num_seqs,
         enable_prefix_caching=args.enable_prefix_caching,
@@ -215,9 +216,15 @@ if __name__ == "__main__":
     parser.add_argument("--iter", type=int, default=None,
                         help="Used to slice the datasets in range iter * rollout_batch_size: (iter + 1) * rollout_batch_size",
     )
     parser.add_argument("--rollout-batch-size", type=int, default=2048, help="Number of samples to generate")
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        default=False,
+                        help='enable trust-remote-code for transformer to load model')
     args = parser.parse_args()
 
+    args.output_path = standardize_path(args.output_path, check_write=True)
+
     if args.task and args.task == "generate_vllm":
         batch_generate_vllm(args)
     elif args.task and args.task == "rejection_sampling":
diff --git a/mindspeed_llm/tasks/posttrain/rejection_sampling/utils.py b/mindspeed_llm/tasks/posttrain/rejection_sampling/utils.py
index b5d10c4dd8a23dbcbcbe347f0e4e1eb2d18c9dd6..e397fe44520b127994e91ccf3f4c5010ef46bcab 100644
--- a/mindspeed_llm/tasks/posttrain/rejection_sampling/utils.py
+++ b/mindspeed_llm/tasks/posttrain/rejection_sampling/utils.py
@@ -31,7 +31,7 @@ def blending_datasets(
         ext = os.path.splitext(dataset)[-1]
         # local python script
         if ext == ".py" or (os.path.isdir(dataset) and os.path.exists(os.path.join(dataset, f"{dataset_basename}.py"))):
-            data = load_dataset(dataset, trust_remote_code=True)
+            data = load_dataset(dataset, trust_remote_code=False)
             strategy.print(f"loaded {dataset} with python script")
         # local text file
         elif ext in [".json", ".jsonl", ".csv"]:
diff --git a/mindspeed_llm/tasks/posttrain/sft/sft_trainer.py b/mindspeed_llm/tasks/posttrain/sft/sft_trainer.py
index 9bd1fb937d8c9edbba9bd6923f1ad4c3eeaa4756..493cf4bb3312fd099eec3ac5ceee46fe96839132 100644
--- a/mindspeed_llm/tasks/posttrain/sft/sft_trainer.py
+++ b/mindspeed_llm/tasks/posttrain/sft/sft_trainer.py
@@ -13,7 +13,7 @@ from megatron.training import get_timers
 try:
     from mindspeed.core.pipeline_parallel.dualpipev.dualpipev_schedules import set_post_process_flag
 except ImportError:
-    pass
+    print("[warning] failed to import dualpipe modules; dualpipe is not supported")
 from mindspeed_llm.training.utils import get_tune_attention_mask, get_finetune_data_on_this_tp_rank, generate_actual_seq_len
 from mindspeed_llm.tasks.posttrain.base import BaseTrainer
 from mindspeed_llm.training.utils import generate_actual_seq_len, set_mtp_batch_list, get_mtp_batch_list
diff --git a/mindspeed_llm/tasks/posttrain/trl_ppo/utils.py b/mindspeed_llm/tasks/posttrain/trl_ppo/utils.py
index 39da439f6a2b743ef34f1de0fd096e4708fa25e8..69e22b65124dd9af84723bdcd7ed1e9579683862 100644
--- a/mindspeed_llm/tasks/posttrain/trl_ppo/utils.py
+++ b/mindspeed_llm/tasks/posttrain/trl_ppo/utils.py
@@ -1,5 +1,5 @@
 from typing import Union
-
+import os
 import torch
 from megatron.core import mpu, dist_checkpointing
@@ -19,6 +19,7 @@ from megatron.training.training import compute_throughputs_and_append_to_progress_log
 from megatron.training.utils import unwrap_model, print_rank_0, append_to_progress_log
 from megatron.training.yaml_arguments import core_transformer_config_from_yaml
 from mindspeed_llm.tasks.posttrain.orm.orm_model import GPTRewardModel
+from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
 
 
 def model_provider(is_reward_model=False, pre_process=True, post_process=True) -> Union[GPTModel]:
@@ -137,6 +138,8 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler,
     if save_model_type:
         save_path = args.save + '/' + save_model_type
 
+    save_path = standardize_path(save_path, check_write=True)
+
     ckpt_format = args.ckpt_format if args.use_dist_ckpt else 'torch'
     print_rank_0('saving checkpoint at iteration {:7d} to {} in {} format'.format(
         iteration, save_path, ckpt_format))
diff --git a/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/grader.py b/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/grader.py
index 1f435401190d2933edfe94566117310b7d7acaae..47cf553a35566bd6477dae7a37fd614908f909f9 100644
--- a/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/grader.py
+++ b/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/grader.py
@@ -38,7 +38,7 @@ def parse_digits(num):
         try:
             return float(num) / 100
         except:
-            pass
+            return None
     return None
diff --git a/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/parser.py b/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/parser.py
index e2805f7443e1ef263a9c09f90a7cd8239f239102..1cd251d4fea9df296c9b775dd19fbf90a9e9f012 100644
--- a/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/parser.py
+++ b/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/parser.py
@@ -1,5 +1,6 @@
 import random
 import re
+import ast
 from typing import TypeVar, Iterable, List, Union, Any, Dict
 import regex
@@ -75,7 +76,7 @@ def convert_word_number(text: str) -> str:
     try:
         text = str(w2n.word_to_num(text))
     except ValueError:
-        pass
+        return text
     return text
@@ -468,7 +469,7 @@ def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
         pred = clean_units(pred)
         try:
             tmp = str(latex2sympy(pred))
-            pred = str(eval(tmp))
+            pred = str(ast.literal_eval(tmp))
         except Exception:
             if re.match(r"-?[\d\.]+\s\D+$", pred):
                 pred = pred.split(" ")[0]
diff --git a/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/utils.py b/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/utils.py
index d84b3fbd82a69279b10c803cdbf16d69a0e7e624..b19ea603b29576ddfc209e861ed742ff535b2dba 100644
--- a/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/utils.py
+++ b/mindspeed_llm/tasks/posttrain/verifier/math_eval_toolkit/utils.py
@@ -21,16 +21,14 @@ def load_jsonl(file: Union[str, Path]) -> Iterable[Any]:
                 yield json.loads(line)
             except json.JSONDecodeError as e:
                 print("Error in loading JSON:", line, "Error:", e)
-                pass
             except Exception as e:
                 print("Unexpected error in loading:", line, "Error:", e)
-                pass
 
 
 def save_jsonl(samples, save_path):
     # ensure path
     folder = os.path.dirname(save_path)
-    os.makedirs(folder, exist_ok=True)
+    os.makedirs(folder, mode=0o750, exist_ok=True)
 
     with open(save_path, "w", encoding="utf-8") as f:
         for sample in samples:
diff --git a/mindspeed_llm/tasks/preprocess/data_handler.py b/mindspeed_llm/tasks/preprocess/data_handler.py
index 8d4d6cf325b9518c072e04a827aa672827e591eb..2ec2fb879a554b11a9ecf5ee5558df84b5920d28 100644
--- a/mindspeed_llm/tasks/preprocess/data_handler.py
+++ b/mindspeed_llm/tasks/preprocess/data_handler.py
@@ -30,6 +30,7 @@ from datasets import load_dataset
 from megatron.core.datasets import indexed_dataset
 
 from mindspeed_llm.tasks.preprocess.templates import Prompter, AlpacaTemplate, get_model_template
+from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
 from mindspeed_llm.tasks.posttrain.utils import convert_token_to_id
 from .decoder_packed_mtf_dataset import _infer_seqlen
@@ -626,7 +627,7 @@ class AlpacaStyleProcessRewardHandler(BaseDatasetHandler):
 
         concatenated_ids = {
             "input_ids": [input_token],
-            "attention_mask":[attention_mask],
+            "attention_mask": [attention_mask],
             "labels": [label_token]
         }
 
@@ -1067,6 +1068,7 @@ def build_dataset(args):
         # for MOSS, streaming is needed.
         args.streaming = True
     if args.hf_datasets_params:
+        args.hf_datasets_params = standardize_path(args.hf_datasets_params, check_read=True)
         with open(args.hf_datasets_params, 'r') as fin:
             param_dict = json.load(fin)
         return load_dataset(**param_dict)
diff --git a/mindspeed_llm/tasks/preprocess/formatter.py b/mindspeed_llm/tasks/preprocess/formatter.py
index 489b28abba585e0abf47913e2236930d9a4fe93f..02af7c1f3774e1d775e6d06ce192a52902b4cdf6 100644
--- a/mindspeed_llm/tasks/preprocess/formatter.py
+++ b/mindspeed_llm/tasks/preprocess/formatter.py
@@ -191,6 +191,7 @@ class ToolFormatter(Formatter):
         if self.tool_format is None:
             raise ValueError("Tool format was not found.")
 
+
     def apply(self, **kwargs) -> SLOTS:
         content = kwargs.pop("content")
         try:
@@ -202,7 +203,8 @@ class ToolFormatter(Formatter):
                 return [default_tool_formatter(tools)]
             else:
                 raise NotImplementedError
-        except Exception:
+        except Exception as e:
+            print(f"[warning] Unexpected error processing content: {content}. Error: {e}")
             return [""]
 
     def extract(self, content: str) -> Union[str, Tuple[str, str]]:
diff --git a/mindspeed_llm/tasks/preprocess/utils.py b/mindspeed_llm/tasks/preprocess/utils.py
index 7f9d2d5523188f960c732afcd8643c7245e72840..90448c54f6bd950f7d161d1d7d3551ab2b715753 100644
--- a/mindspeed_llm/tasks/preprocess/utils.py
+++ b/mindspeed_llm/tasks/preprocess/utils.py
@@ -23,7 +23,7 @@ from datasets import load_dataset, concatenate_datasets, interleave_datasets
 
 from mindspeed_llm.tasks.preprocess.templates import Role
 from mindspeed_llm.tasks.preprocess.parser import InstructionDatasetAttr
-
+from mindspeed_llm.tasks.evaluation.file_utils import standardize_path
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -103,6 +103,7 @@ def get_dataset_list(data_args) -> List["InstructionDatasetAttr"]:
     else:
         dataset_names = []
 
+    data_args.dataset_dir = standardize_path(data_args.dataset_dir, check_read=True)
     try:
         with open(os.path.join(data_args.dataset_dir, DATA_CONFIG), "r") as f:
             dataset_info = json.load(f)
diff --git a/mindspeed_llm/training/tokenizer/tokenizer.py b/mindspeed_llm/training/tokenizer/tokenizer.py
index b82b094270224f378691286472a308d25c2e46d3..5b0e48cc3fdbce41a94544397361c20f4ea6aed8 100644
--- a/mindspeed_llm/training/tokenizer/tokenizer.py
+++ b/mindspeed_llm/training/tokenizer/tokenizer.py
@@ -49,6 +49,7 @@ def build_tokenizer(args):
             model_max_length=args.seq_length,
             use_fast=args.tokenizer_not_use_fast,
             prompt_type=args.prompt_type,
+            trust_remote_code=args.trust_remote_code,
             **hf_tokenizer_kwargs
         )
 
@@ -108,7 +109,7 @@ class TokenizerAdaptor:
 class _AutoTokenizer(MegatronTokenizer):
     """AutoTokenizer for Hf Pretrained model loading."""
 
-    def __init__(self, tokenizer_name_or_path, vocab_extra_ids, model_max_length, use_fast, prompt_type=None, **kwargs):
+    def __init__(self, tokenizer_name_or_path, vocab_extra_ids, model_max_length, use_fast, prompt_type=None, trust_remote_code=False, **kwargs):
         name = tokenizer_name_or_path
         super().__init__(name)
         hf_tokenizer_kwargs = kwargs
@@ -117,7 +118,7 @@ class _AutoTokenizer(MegatronTokenizer):
         hf_tokenizer_kwargs["model_max_length"] = model_max_length
         hf_tokenizer_kwargs["use_fast"] = use_fast
-        hf_tokenizer_kwargs["trust_remote_code"] = True
+        hf_tokenizer_kwargs["trust_remote_code"] = trust_remote_code
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **hf_tokenizer_kwargs, local_files_only=True)
         if (prompt_type is None) and (self.tokenizer.pad_token_id is None):
             self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
diff --git a/mindspeed_llm/training/training.py b/mindspeed_llm/training/training.py
index 1b6c05d85865a40cede4e00b6005ff28fd13644f..d6ca28e66188dd7b4a2c6e959e196e92315af752 100644
--- a/mindspeed_llm/training/training.py
+++ b/mindspeed_llm/training/training.py
@@ -18,7 +18,6 @@ import os
 import gc
 import sys
 import json
-from datetime import datetime
 from functools import wraps
 import logging
@@ -27,7 +26,6 @@ import time
 
 import torch
 import torch_npu
-from megatron.core.transformer.moe.moe_utils import track_moe_metrics
 from megatron.training import get_args
 from megatron.training import get_timers
 from megatron.training import get_signal_handler
diff --git a/mindspeed_llm/training/utils.py b/mindspeed_llm/training/utils.py
index 8a9a8238d94680a24f59e26a33c9756b80bc445d..38a5abe6221c9252b4eb8aecf61216766669831d 100644
--- a/mindspeed_llm/training/utils.py
+++ b/mindspeed_llm/training/utils.py
@@ -18,6 +18,7 @@ import os
 import stat
 import random
 import warnings
+import logging
 from functools import wraps
 from typing import Optional, Union, List
 from itertools import takewhile
@@ -45,14 +46,16 @@ from mindspeed_llm.tasks.dataset.shared_memory_manager import SharedMemoryManager
 
 try:
     from mindspeed.core.pipeline_parallel.dualpipev.dualpipev_schedules import get_post_process_flag
-except Exception:
-    pass
+except Exception as warn_get_post_process_flag:
+    logging.error(f"Failed to import get_post_process_flag: {warn_get_post_process_flag}")
 
 try:
     _torch_version = PkgVersion(torch.__version__)
-except Exception:
-    # This is a WAR for building docs, where torch is not actually imported
+except Exception as warn_torch_ver:
+    logging.error(f"Failed to get torch version: {warn_torch_ver}")
+    # This is a special case for building docs, where torch has not been imported
     _torch_version = PkgVersion("0.0.0")
+    logging.warning("Using default torch version '0.0.0' for documentation build.")
 
 WRITE_FILE_DEFAULT_FLAGS = os.O_WRONLY | os.O_CREAT
diff --git a/preprocess_data.py b/preprocess_data.py
index 8d56d4ddf927b0e362235142e51cdb626a171316..f5e390f4679dbe17110351ca8c14f0c11b4a90d9 100644
--- a/preprocess_data.py
+++ b/preprocess_data.py
@@ -198,6 +198,10 @@ def add_tokenizer_args(parser):
         default=[],
         help="The labels represent the correctness of each reasoning step in the entire reasoning process.",
     )
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        default=False,
+                        help='enable trust-remote-code for transformer to load model')
 
 
 def add_output_args(parser):
diff --git a/pretrain_mamba.py b/pretrain_mamba.py
index fa99b879e845811a873baa0f7eb9749b0b43617d..419e01cf1d3608709ea15a47cc1505e07ae21e4d 100644
--- a/pretrain_mamba.py
+++ b/pretrain_mamba.py
@@ -6,6 +6,7 @@ from functools import partial
 from typing import List, Optional
 
 import torch
+
 from mindspeed_llm import megatron_adaptor
 from megatron.training import get_args
 from megatron.training import print_rank_0
@@ -64,7 +65,7 @@ def model_provider(pre_process=True, post_process=True) -> MambaModel:
     if args.spec is not None:
         mamba_stack_spec = import_module(args.spec)
     else:
-        raise("You must provide a valid Mamba layer spec!")
+        raise ValueError("You must provide a valid Mamba layer spec!")
 
     model = MambaModel(
         config=config,
@@ -103,13 +104,8 @@ def get_batch(data_iterator):
     # get batches based on the TP rank you are on
     batch, actual_seq_len = get_batch_on_this_tp_rank(data_iterator)
     args = get_args()
-    if args.return_document_ids and all(
-            rank == 0 for rank in (
-                mpu.get_context_parallel_rank(),
-                mpu.get_tensor_model_parallel_rank(),
-                mpu.get_pipeline_model_parallel_rank()
-            )
-    ):
+    is_rank_0 = (mpu.get_context_parallel_rank() == 0 and mpu.get_tensor_model_parallel_rank() == 0 and mpu.get_pipeline_model_parallel_rank() == 0)
+    if args.return_document_ids and is_rank_0:
         print("current idx: {}, current rank: {}, data_parallel_rank: {}, document_ids: {}".format(batch['idx'], torch.distributed.get_rank(), mpu.get_data_parallel_rank(), batch['document_ids']))
     batch.pop('document_ids', None)
     batch.pop('idx', None)
diff --git a/tests/st/shell_scripts/chatglm3_gqa_cp4.sh b/tests/st/shell_scripts/chatglm3_gqa_cp4.sh
index 3d0551af485cc74b89be9120b64cd2864621098b..3d793ba29287c907bf5d2e531ba59f7cde2de7cc 100644
--- a/tests/st/shell_scripts/chatglm3_gqa_cp4.sh
+++ b/tests/st/shell_scripts/chatglm3_gqa_cp4.sh
@@ -31,6 +31,7 @@ DISTRIBUTED_ARGS="
 "
 
 GPT_ARGS="
+    --trust-remote-code \
     --use-mcore-models \
     --manual-gc \
     --manual-gc-interval 50 \
diff --git a/tests/st/shell_scripts/deepseek_500b_tp1_pp2_ep2_cp2_overlap.sh b/tests/st/shell_scripts/deepseek_500b_tp1_pp2_ep2_cp2_overlap.sh
index 7bd4b9f2d8f9df65daa1f811657859b85ec12f73..6815a33636eddf518edb3aa8d7f621c3e2798411 100644
--- a/tests/st/shell_scripts/deepseek_500b_tp1_pp2_ep2_cp2_overlap.sh
+++ b/tests/st/shell_scripts/deepseek_500b_tp1_pp2_ep2_cp2_overlap.sh
@@ -46,6 +46,7 @@ MOE_ARGS="
 "
 
 GPT_ARGS="
+    --trust-remote-code \
     --use-mcore-models \
     --manual-gc \
     --manual-gc-interval 50 \
diff --git a/tests/st/shell_scripts/deepseek_v3_mcore_tp1_pp2_ep4.sh b/tests/st/shell_scripts/deepseek_v3_mcore_tp1_pp2_ep4.sh
index 1e10e94d4adcb8ebcdf1e32cf1ecb5c56c80e063..96af75d795500a3a31800533cee74fed9848a92d 100644
--- a/tests/st/shell_scripts/deepseek_v3_mcore_tp1_pp2_ep4.sh
+++ b/tests/st/shell_scripts/deepseek_v3_mcore_tp1_pp2_ep4.sh
@@ -83,6 +83,7 @@ ROPE_ARGS="
 
 GPT_ARGS="
     --finetune \
+    --trust-remote-code \
     --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
     --noop-layers 2,3 \
     --recompute-granularity full \
diff --git a/tests/st/shell_scripts/deepseek_v3_mcore_tp2_pp2_ep2_dualpipev_fb.sh b/tests/st/shell_scripts/deepseek_v3_mcore_tp2_pp2_ep2_dualpipev_fb.sh
index 687825d4798fcc3788ede42fd8e43409d19d9d9b..ea6ebf30d05897a371088f01a693ce7327948139 100644
--- a/tests/st/shell_scripts/deepseek_v3_mcore_tp2_pp2_ep2_dualpipev_fb.sh
+++ b/tests/st/shell_scripts/deepseek_v3_mcore_tp2_pp2_ep2_dualpipev_fb.sh
@@ -102,6 +102,7 @@ ROPE_ARGS="
 "
 
 GPT_ARGS="
+    --trust-remote-code \
     --transformer-impl local \
     --spec mindspeed_llm.tasks.models.spec.deepseek_spec layer_spec \
     --reset-position-ids \
diff --git a/tests/st/shell_scripts/mixtral_mcore_tp4_cp2_ep2_ptd.sh b/tests/st/shell_scripts/mixtral_mcore_tp4_cp2_ep2_ptd.sh
index 593e68abbf3e28c03a74ea1204a681be8da0d977..b286515ced252c2c50c7bd178a2bc6aa7fb39e76 100644
--- a/tests/st/shell_scripts/mixtral_mcore_tp4_cp2_ep2_ptd.sh
+++ b/tests/st/shell_scripts/mixtral_mcore_tp4_cp2_ep2_ptd.sh
@@ -44,6 +44,7 @@ MOE_ARGS=(
 )
 
 GPT_ARGS=(
+    --trust-remote-code
     --use-mcore-models
     --manual-gc
     --manual-gc-interval 50
diff --git a/tests/ut/checkpoint/test_checkpoint.json b/tests/ut/checkpoint/test_checkpoint.json
index e18c38493f0a5ab4c40a5753f06bcd261915cdcb..7a67649870e57aa7dc60b0bb98e4f4696dac3fbc 100644
--- a/tests/ut/checkpoint/test_checkpoint.json
+++ b/tests/ut/checkpoint/test_checkpoint.json
@@ -2,6 +2,7 @@
     "test_deepseek2_hf2mcore_tp1pp4ep8": [
         {
             "param": {
+                "trust-remote-code":null,
                 "model-type":"GPT",
                 "load-model-type":"hf",
                 "save-model-type":"mg",
@@ -23,6 +24,7 @@
     "test_deepseek2_mcore2hf_tp1pp4ep8": [
        {
            "param": {
+                "trust-remote-code":null,
                "model-type":"GPT",
                "load-model-type":"mg",
                "save-model-type": "hf",
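
Note on the path-handling hunks above: they all call a `standardize_path` helper exported from `mindspeed_llm/tasks/evaluation/file_utils.py`, whose implementation is not part of this patch. The following is only a minimal sketch of the contract those call sites appear to rely on; the keyword names `check_read`/`check_write` come from the call sites, while the body is an assumption rather than the project's actual code.

```python
import os


def standardize_path(path: str, check_read: bool = False, check_write: bool = False) -> str:
    """Hypothetical sketch: normalize a user-supplied path and run basic permission checks."""
    if not path:
        raise ValueError("path must be a non-empty string")
    # Resolve '~', symlinks and relative components so later open()/os.makedirs() calls
    # operate on an absolute path rather than attacker-controlled '..' segments.
    real_path = os.path.realpath(os.path.expanduser(path))
    if check_read and os.path.exists(real_path) and not os.access(real_path, os.R_OK):
        raise PermissionError(f"no read permission for {real_path}")
    if check_write:
        # For a not-yet-created target, probe the nearest existing parent directory.
        probe = real_path if os.path.isdir(real_path) else os.path.dirname(real_path)
        if os.path.exists(probe) and not os.access(probe, os.W_OK):
            raise PermissionError(f"no write permission for {probe}")
    return real_path
```

Under these assumptions, wrapping inputs such as `args.output_path` or `hf_model_path` before any `os.makedirs(..., mode=0o750)` or `open()` call gives a single place to reject unreadable or unwritable locations.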