From b1a1387721b1f7b65ed3c84bf6a9f7137677889c Mon Sep 17 00:00:00 2001
From: ivanshan_8170
Date: Tue, 26 Aug 2025 21:13:59 +0800
Subject: [PATCH] lccl2hccl

---
 include/atb/infer_op_params.h             |  2 +-
 .../all_gather/all_gather_operation.cpp   | 39 ++++-----
 .../all_reduce/all_reduce_operation.cpp   | 13 +--
 .../all_to_all/all_to_all_lccl_runner.cpp |  2 +-
 .../all_to_all/all_to_all_operation.cpp   | 86 +++++++++++--------
 5 files changed, 75 insertions(+), 67 deletions(-)

diff --git a/include/atb/infer_op_params.h b/include/atb/infer_op_params.h
index 7896da47..82f65ae0 100644
--- a/include/atb/infer_op_params.h
+++ b/include/atb/infer_op_params.h
@@ -2409,7 +2409,7 @@ struct AllToAllParam {
     //! 多通信域并行功能使用结束后,"LCCL_PARALLEL"需要设置为0或者false,否则会导致基础场景性能下降。
     std::string commDomain;
     //! \brief 通信结果对输入进行转置。
-    //! 仅当backend为"lccl"时生效
+    //! 为true时使用lccl
     bool transpose = false;
     //!
     //! \brief 预留参数
diff --git a/src/ops_infer/all_gather/all_gather_operation.cpp b/src/ops_infer/all_gather/all_gather_operation.cpp
index 70615ce1..e4a0e0f6 100644
--- a/src/ops_infer/all_gather/all_gather_operation.cpp
+++ b/src/ops_infer/all_gather/all_gather_operation.cpp
@@ -35,9 +35,9 @@ template <> Status CreateOperation(const infer::AllGatherParam &opParam, Operati
         ATB_LOG(ERROR) << "backend is " << opParam.backend << "backend must either be hccl or lccl";
         return ERROR_INVALID_PARAM;
     }
-    if (opParam.backend == "lccl" && GetSingleton<Config>().Is310P()) {
-        ATB_LOG(ERROR) << "AllGather lccl is not support in Atlas inference products";
-        return ERROR_INVALID_PARAM;
+    if (opParam.backend == "lccl") {
+        ATB_LOG(WARN)
+            << "DEPRECATED: backend lccl is no longer supported and will be removed soon. Please use hccl instead";
     }
     if (OperationUtil::DistributedInitCheck(opParam) != NO_ERROR) {
         ATB_LOG(ERROR) << "AllGatherOperation DistributedInitCheck failed";
@@ -84,8 +84,8 @@ Status AllGatherOperation::InferShapeImpl(const SVector &inTensorDes
 Status AllGatherOperation::InferShapeCheckImpl(const SVector<TensorDesc> &inTensorDescs) const
 {
     if (inTensorDescs.at(0).shape.dimNum >= MAX_DIM) {
-        ATB_LOG(ERROR) << "inTensor(0) dimNum should < MAX_DIM(8)";
-        return ERROR_INVALID_TENSOR_DIM;
+        ATB_LOG(ERROR) << "inTensor(0) dimNum should < MAX_DIM(8), but got " << inTensorDescs.at(0).shape.dimNum;
+        return ERROR_INVALID_TENSOR_DIM_NUM;
     }
     return NO_ERROR;
 }
@@ -93,15 +93,17 @@ Status AllGatherOperation::InferShapeCheckImpl(const SVector &inTens
 Status AllGatherOperation::SetupCheckImpl(const SVector<Tensor> &inTensors, const SVector<Tensor> &outTensors) const
 {
     if (inTensors.at(0).desc.shape.dimNum >= MAX_DIM) {
-        ATB_LOG(ERROR) << "inTensor(0) dimNum should < MAX_DIM(8)";
-        return ERROR_INVALID_TENSOR_DIM;
+        ATB_LOG(ERROR) << "inTensor(0) dimNum should < MAX_DIM(8), but got " << inTensors.at(0).desc.shape.dimNum;
+        return ERROR_INVALID_TENSOR_DIM_NUM;
     }
     if (outTensors.at(0).desc.shape.dimNum != (inTensors.at(0).desc.shape.dimNum + 1)) {
-        ATB_LOG(ERROR) << "outTensor dim should be one larger than inTensor dim";
-        return ERROR_INVALID_TENSOR_DIM;
+        ATB_LOG(ERROR) << "outTensor dimNum[" << outTensors.at(0).desc.shape.dimNum
+                       << "] should be one larger than inTensor dimNum[" << inTensors.at(0).desc.shape.dimNum << "]";
+        return ERROR_INVALID_TENSOR_DIM_NUM;
     }
     if (outTensors.at(0).desc.shape.dims[0] != param_.rankSize) {
-        ATB_LOG(ERROR) << "outTensor first dimension does not match rankSize";
+        ATB_LOG(ERROR) << "outTensor first dimension[" << outTensors.at(0).desc.shape.dims[0]
+                       << "] does not match rankSize[" << param_.rankSize << "]";
         return ERROR_INVALID_TENSOR_DIM;
     }
     return NO_ERROR;
@@ -109,18 +111,13 @@ Status AllGatherOperation::SetupCheckImpl(const SVector &inTensors, cons
 
 std::shared_ptr<Runner> AllGatherOperation::CreateRunner(Context &context) const
 {
-    (void)context;
-    if (param_.backend == "hccl") {
-        if (param_.hcclComm == nullptr) {
-            return std::make_shared<AllGatherHcclRunner>(param_, !param_.rankTableFile.empty());
-        } else {
-            return std::make_shared<AllGatherHcclRunner>(param_, param_.hcclComm);
-        }
-    } else if (param_.backend == "lccl") {
-        return std::make_shared<AllGatherLcclRunner>(param_, context);
+    if (param_.commMode == infer::CommMode::COMM_MULTI_THREAD) {
+        return std::make_shared<AllGatherLcclRunner>(param_, context);
+    }
+    if (param_.hcclComm == nullptr) {
+        return std::make_shared<AllGatherHcclRunner>(param_, !param_.rankTableFile.empty());
     }
-    ATB_LOG(FATAL) << "AllGatherOperation::AllGatherOperation backend " << param_.backend << "is not exist.";
-    return std::shared_ptr<Runner>();
+    return std::make_shared<AllGatherHcclRunner>(param_, param_.hcclComm);
 }
 
 nlohmann::json AllGatherOperation::GetParamJson() const
diff --git a/src/ops_infer/all_reduce/all_reduce_operation.cpp b/src/ops_infer/all_reduce/all_reduce_operation.cpp
index 6e27695c..93c9840f 100644
--- a/src/ops_infer/all_reduce/all_reduce_operation.cpp
+++ b/src/ops_infer/all_reduce/all_reduce_operation.cpp
@@ -233,15 +233,10 @@ Status AllReduceOperation::QuantShapeCheck(const TensorDesc &scale, const Tensor
 
 std::shared_ptr<Runner> AllReduceOperation::CreateRunner(Context &context) const
 {
-    (void)context;
-    if (param_.backend == "hccl") {
-        if (param_.hcclComm == nullptr) {
-            return std::make_shared<AllReduceHcclRunner>(param_, !param_.rankTableFile.empty());
-        } else {
-            return std::make_shared<AllReduceHcclRunner>(param_, param_.hcclComm);
-        }
-    } else if (param_.backend == "lccl") {
-        return std::make_shared<AllReduceLcclRunner>(param_, context);
+    if (param_.hcclComm == nullptr) {
+        return std::make_shared<AllReduceHcclRunner>(param_, !param_.rankTableFile.empty());
+    } else {
+        return std::make_shared<AllReduceHcclRunner>(param_, param_.hcclComm);
     }
     return std::shared_ptr<Runner>();
 }
diff --git a/src/ops_infer/all_to_all/all_to_all_lccl_runner.cpp b/src/ops_infer/all_to_all/all_to_all_lccl_runner.cpp
index 8faf35f0..d6c7cd03 100644
--- a/src/ops_infer/all_to_all/all_to_all_lccl_runner.cpp
+++ b/src/ops_infer/all_to_all/all_to_all_lccl_runner.cpp
@@ -49,7 +49,7 @@ Status AllToAllLcclRunner::ExecuteImpl(RunnerVariantPack &runnerVariantPack)
                                 GetExecuteStream(runnerVariantPack.context));
     }
     if (ret == Lcal::LCAL_ERROR_PARA_CHECK_FAIL) {
-        ATB_LOG(ERROR) << "ret: " << ret << " LCCL_PARALLEL should be 0 or fasle";
+        ATB_LOG(ERROR) << "ret: " << ret << " LCCL_PARALLEL should be 0 or false";
         return ERROR_INVALID_SINGLE_OPERATION_PARAM;
     }
     if (ret != 0) {
diff --git a/src/ops_infer/all_to_all/all_to_all_operation.cpp b/src/ops_infer/all_to_all/all_to_all_operation.cpp
index a0b635d0..21c3cdfd 100644
--- a/src/ops_infer/all_to_all/all_to_all_operation.cpp
+++ b/src/ops_infer/all_to_all/all_to_all_operation.cpp
@@ -35,10 +35,15 @@ template <> Status CreateOperation(const infer::AllToAllParam &opParam, Operatio
         return ERROR_INVALID_PARAM;
     }
     OP_PARAM_RSV_CHECK(opParam);
+    ATB_LOG(INFO) << "AllToAll rank:" << opParam.rank;
     if (opParam.backend != "hccl" && opParam.backend != "lccl") {
         ATB_LOG(ERROR) << "backend is " << opParam.backend << "backend must be hccl or lccl";
         return ERROR_INVALID_PARAM;
     }
+    if (opParam.backend == "lccl") {
+        ATB_LOG(WARN)
+            << "DEPRECATED: backend lccl is no longer supported and will be removed soon. Please use hccl instead";
+    }
     const char *socName = aclrtGetSocName();
     if (!socName) {
         ATB_LOG(ERROR) << "aclrtGetSocName failed!";
@@ -54,19 +59,15 @@ template <> Status CreateOperation(const infer::AllToAllParam &opParam, Operatio
             ATB_LOG(ERROR) << "AllToAll hccl only supports Atlas 800I A2/A3 or Atlas 900 A3 Superpod";
             return ERROR_INVALID_PARAM;
         }
-        if (opParam.transpose) {
-            ATB_LOG(ERROR) << "AllToAll hccl doesn't support transpose";
-            return ERROR_INVALID_PARAM;
-        }
-    }
-    if (opParam.backend == "lccl" && opParam.rankSize % 2 != 0) { // 2 : Even ranksize
-        ATB_LOG(ERROR) << "AllToAll lccl only supports even ranksize";
-        return ERROR_INVALID_PARAM;
     }
     if (OperationUtil::DistributedInitCheck(opParam) != NO_ERROR) {
         ATB_LOG(ERROR) << "AllToAllOperation DistributedInitCheck failed";
         return ERROR_INVALID_PARAM;
     }
+    if (opParam.backend == "lccl" && opParam.rankSize % 2 != 0) { // 2 : Even ranksize
+        ATB_LOG(ERROR) << "AllToAll lccl only supports even ranksize";
+        return ERROR_INVALID_PARAM;
+    }
     *operation = new (std::nothrow) AllToAllOperation(opParam);
     if (*operation == nullptr) {
         ATB_LOG(ERROR) << "failed to new AllToAllOperation";
@@ -103,24 +104,33 @@ Status AllToAllOperation::InferShapeCheckImpl(const SVector &inTenso
         return NO_ERROR;
     }
     if (inTensorDescs.at(0).shape.dimNum != TRANSPOSE_IN_TENSOR_DIM_NUM) { // 2: transpose only support dimNum
-        ATB_LOG(ERROR) << "inTensor[0] dimNum should be " << TRANSPOSE_IN_TENSOR_DIM_NUM
-                       << ", but got: " << inTensorDescs.at(0).shape.dimNum;
+        ATB_LOG(ERROR) << GetLogPrefix() << "AllToAll with transpose: inTensor[0] dimNum should be "
+                       << TRANSPOSE_IN_TENSOR_DIM_NUM << ", but got: " << inTensorDescs.at(0).shape.dimNum;
         return ERROR_INVALID_TENSOR_DIM_NUM;
     }
     if (inTensorDescs.at(0).shape.dims[1] % param_.rankSize != 0) {
-        ATB_LOG(ERROR) << "intensors[0].dims[0] must be an integer multiple of ranksize but got dims[0]: "
-                       << inTensorDescs.at(0).shape.dims[1] << ", rankSize: " << param_.rankSize;
+        ATB_LOG(ERROR)
+            << GetLogPrefix()
+            << "AllToAll with transpose: intensors[0].dims[1] must be an integer multiple of rankSize, but got dims[1]: "
+            << inTensorDescs.at(0).shape.dims[1] << ", rankSize: " << param_.rankSize;
         return ERROR_INVALID_TENSOR_DIM;
     }
     int64_t wSize = inTensorDescs.at(0).shape.dims[TRANSPOSE_IN_TENSOR_DIM_NUM - 1] *
                     static_cast<int64_t>(sizeof(inTensorDescs.at(0).dtype));
     if (wSize / param_.rankSize >= MAX_W_SIZE) {
-        ATB_LOG(ERROR) << "intensors[0].dims[1] / rankSize must be no greater than 90K, but got bytes: " << wSize;
+        ATB_LOG(ERROR)
+            << GetLogPrefix()
+            << "AllToAll with transpose: intensors[0].dims[1] / rankSize must be no greater than 90K, but got bytes: "
+            << wSize
+            << ", rankSize: " << param_.rankSize;
         return ERROR_INVALID_TENSOR_DIM;
     }
     uint64_t tensorSize = Utils::GetTensorSize(inTensorDescs.at(0));
     if (tensorSize > MAX_TENSOR_SIZE) {
-        ATB_LOG(ERROR) << "intensors[0] total tensor size must be no greater than 190MB, but got bytes: " << tensorSize;
+        ATB_LOG(ERROR)
+            << GetLogPrefix()
+            << "AllToAll with transpose: intensors[0] total tensor size must be no greater than 190MB, but got bytes: "
+            << tensorSize;
         return ERROR_INVALID_TENSOR_DIM;
     }
     return NO_ERROR;
@@ -130,7 +140,7 @@ Status AllToAllOperation::InferShapeImpl(const SVector &inTensorDesc
                                          SVector<TensorDesc> &outTensorDescs) const
 {
     outTensorDescs.at(0) = inTensorDescs.at(0);
-    if (param_.backend == "lccl" && param_.transpose) {
+    if (param_.transpose) { // lccl
         outTensorDescs.at(0).shape.dims[0] = inTensorDescs.at(0).shape.dims[0] * param_.rankSize;
         outTensorDescs.at(0).shape.dims[1] = inTensorDescs.at(0).shape.dims[1] / param_.rankSize;
     }
@@ -146,30 +156,39 @@ Status AllToAllOperation::SetupCheckImpl(const SVector &inTensors, const
         return st;
     }
     if (!param_.transpose && !TensorUtil::TensorDescEqual(inTensors.at(0).desc, outTensors.at(0).desc)) {
-        ATB_LOG(ERROR) << GetLogPrefix() << "intensor desc and outtensor desc should be same";
+        ATB_LOG(ERROR) << GetLogPrefix()
+                       << "AllToAll without transpose: intensor desc and outtensor desc should be same";
         return ERROR_INVALID_TENSOR_DIM;
     }
     if (param_.transpose) {
         if (inTensors.at(0).desc.shape.dimNum != TRANSPOSE_IN_TENSOR_DIM_NUM) {
-            ATB_LOG(ERROR) << "invalid inTensor dimNum, should be 2, but got inTensors[0] dimNum: "
-                           << inTensors.at(0).desc.shape.dimNum;
+            ATB_LOG(ERROR)
+                << GetLogPrefix()
+                << "AllToAll with transpose: invalid inTensor dimNum, should be 2, but got inTensors[0] dimNum: "
+                << inTensors.at(0).desc.shape.dimNum;
             return ERROR_INVALID_TENSOR_DIM_NUM;
         }
         if (outTensors.at(0).desc.shape.dimNum != TRANSPOSE_IN_TENSOR_DIM_NUM) {
-            ATB_LOG(ERROR) << "invalid outTensor dimNum, should be 2, but got outTensors[0] dimNum: "
-                           << outTensors.at(0).desc.shape.dimNum;
+            ATB_LOG(ERROR)
+                << GetLogPrefix()
+                << "AllToAll with transpose: invalid outTensor dimNum, should be 2, but got outTensors[0] dimNum: "
+                << outTensors.at(0).desc.shape.dimNum;
             return ERROR_INVALID_TENSOR_DIM_NUM;
         }
         if (outTensors.at(0).desc.shape.dims[0] != inTensors.at(0).desc.shape.dims[0] * param_.rankSize) {
-            ATB_LOG(ERROR) << "invalid outTensor dims[0] should be intensors[0].dims[0], * rankSize, i.e. "
-                           << inTensors.at(0).desc.shape.dims[0] << " * " << param_.rankSize << ", but got "
-                           << outTensors.at(0).desc.shape.dims[0];
+            ATB_LOG(ERROR)
+                << GetLogPrefix()
+                << "AllToAll with transpose: invalid outTensor dims[0], should be intensors[0].dims[0] * rankSize, i.e. "
+                << inTensors.at(0).desc.shape.dims[0] << " * " << param_.rankSize << ", but got "
+                << outTensors.at(0).desc.shape.dims[0];
             return ERROR_INVALID_TENSOR_DIM;
         }
         if (outTensors.at(0).desc.shape.dims[1] * param_.rankSize != inTensors.at(0).desc.shape.dims[1]) {
-            ATB_LOG(ERROR) << "invalid outTensor dims[1], should be intensors[0].dims[1]/rankSize, i.e. "
-                           << inTensors.at(0).desc.shape.dims[1] << " / " << param_.rankSize << ", but got "
-                           << outTensors.at(0).desc.shape.dims[1];
+            ATB_LOG(ERROR)
+                << GetLogPrefix()
+                << "AllToAll with transpose: invalid outTensor dims[1], should be intensors[0].dims[1] / rankSize, i.e. "
+                << inTensors.at(0).desc.shape.dims[1] << " / " << param_.rankSize << ", but got "
+                << outTensors.at(0).desc.shape.dims[1];
             return ERROR_INVALID_TENSOR_DIM;
         }
     }
@@ -178,17 +197,14 @@ Status AllToAllOperation::SetupCheckImpl(const SVector &inTensors, const
 
 std::shared_ptr<Runner> AllToAllOperation::CreateRunner(Context &context) const
 {
-    (void)context;
-    if (param_.backend == "hccl") {
-        if (param_.hcclComm == nullptr) {
-            return std::make_shared<AllToAllHcclRunner>(param_, !param_.rankTableFile.empty());
-        } else {
-            return std::make_shared<AllToAllHcclRunner>(param_, param_.hcclComm);
-        }
-    } else if (param_.backend == "lccl") {
+    // only transpose uses lccl
+    if (param_.transpose || param_.commMode == infer::CommMode::COMM_MULTI_THREAD) {
         return std::make_shared<AllToAllLcclRunner>(param_, context);
     }
-    return std::shared_ptr<Runner>();
+    if (param_.hcclComm == nullptr) {
+        return std::make_shared<AllToAllHcclRunner>(param_, !param_.rankTableFile.empty());
+    }
+    return std::make_shared<AllToAllHcclRunner>(param_, param_.hcclComm);
 }
 
 nlohmann::json AllToAllOperation::GetParamJson() const
-- 
Gitee
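For readers skimming the change, the sketch below summarizes the runner-selection policy that the three CreateRunner bodies converge on after this patch. It is an illustrative, standalone approximation rather than code from the repository: the Param, Runner, Context, HcclRunner, and LcclRunner stubs stand in for the operation-specific types (e.g. AllToAllHcclRunner) and are assumptions, not the library's actual declarations.

```cpp
// Hedged sketch of the post-patch dispatch: lccl is kept only for the
// transpose path (AllToAll) and for COMM_MULTI_THREAD; everything else
// goes to hccl, with the constructor chosen by whether an external HCCL
// communicator was supplied. Stub types keep the sketch self-contained.
#include <memory>
#include <string>

struct Context {};
struct Runner { virtual ~Runner() = default; };
struct HcclRunner : Runner {};  // stands in for AllGather/AllReduce/AllToAll HcclRunner
struct LcclRunner : Runner {};  // stands in for the matching LcclRunner

enum class CommMode { COMM_MULTI_PROCESS, COMM_MULTI_THREAD };

struct Param {
    CommMode commMode = CommMode::COMM_MULTI_PROCESS;
    bool transpose = false;    // AllToAll only: the transpose result still runs on lccl
    void *hcclComm = nullptr;  // externally created HCCL communicator, if any
    std::string rankTableFile; // rank table used when no communicator is passed in
};

std::shared_ptr<Runner> CreateRunner(const Param &param, Context &context)
{
    (void)context;  // the real runners take the context; the stubs do not
    if (param.transpose || param.commMode == CommMode::COMM_MULTI_THREAD) {
        return std::make_shared<LcclRunner>();  // lccl-backed runner
    }
    if (param.hcclComm == nullptr) {
        // initialise hccl from the rank table (or default single-node init)
        return std::make_shared<HcclRunner>();
    }
    return std::make_shared<HcclRunner>();      // reuse the external communicator
}
```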