From 24785df2747ee9f31f8b602885513b479fcca5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=88=E5=AE=87=E9=9A=86?= Date: Wed, 10 Sep 2025 15:06:07 +0800 Subject: [PATCH 1/2] solve 1 process ineighbor_alltoall(v|w) result bug --- ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c | 34 ++++++++++++++++--- .../mca/coll/libnbc/nbc_ineighbor_alltoallv.c | 34 ++++++++++++++++--- .../mca/coll/libnbc/nbc_ineighbor_alltoallw.c | 34 ++++++++++++++++--- 3 files changed, 90 insertions(+), 12 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c index d57fcdf3470..8dea1efad1b 100644 --- a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c +++ b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c @@ -89,14 +89,40 @@ static int nbc_neighbor_alltoall_init(const void *sbuf, int scount, MPI_Datatype /* change recv order to solve the problem of opposite results in loop neigbor under 2 processes */ /* issue can see https://github.com/mpi-forum/mpi-issues/issues/153 */ - for (int i = indegree - 1 ; i >= 0 ; --i) { - if (MPI_PROC_NULL != srcs[i]) { - res = NBC_Sched_recv ((char *) rbuf + (MPI_Aint) rcvext * i * rcount, true, rcount, rtype, srcs[i], schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + int flag = 0; + if (OMPI_COMM_IS_CART(comm)) { + for(int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { + if (comm->c_topo->mtc.cart->dims[dim] == 1) { + flag = 1; break; } } } + if (flag != 0) { + for (int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { + if (MPI_PROC_NULL != srcs[2 * dim + 1]) { + res = NBC_Sched_recv ((char *) rbuf + (MPI_Aint) rcvext * (2 * dim + 1) * rcount, true, rcount, rtype, srcs[2 * dim + 1], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } + } + if (MPI_PROC_NULL != srcs[2 * dim]) { + res = NBC_Sched_recv ((char *) rbuf + (MPI_Aint) rcvext * (2 * dim) * rcount, true, rcount, rtype, srcs[2 * dim], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + }
+ } + } + } else { + for (int i = indegree - 1 ; i >= 0 ; --i) { + if (MPI_PROC_NULL != srcs[i]) { + res = NBC_Sched_recv ((char *) rbuf + (MPI_Aint) rcvext * i * rcount, true, rcount, rtype, srcs[i], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } + } + } + } free (srcs); diff --git a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallv.c b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallv.c index e871862c2bd..b886fc2cf2c 100644 --- a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallv.c +++ b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallv.c @@ -93,14 +93,40 @@ static int nbc_neighbor_alltoallv_init(const void *sbuf, const int *scounts, con /* simply loop over neighbors and post send/recv operations */ /* change recv order to solve the problem of opposite results in loop neigbor under 2 processes */ /* issue can see https://github.com/mpi-forum/mpi-issues/issues/153 */ - for (int i = indegree - 1 ; i >= 0 ; --i) { - if (srcs[i] != MPI_PROC_NULL) { - res = NBC_Sched_recv ((char *) rbuf + rdispls[i] * rcvext, false, rcounts[i], rtype, srcs[i], schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + int flag = 0; + if (OMPI_COMM_IS_CART(comm)) { + for(int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { + if (comm->c_topo->mtc.cart->dims[dim] == 1) { + flag = 1; break; } } } + if (flag != 0) { + for (int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { + if (MPI_PROC_NULL != srcs[2 * dim + 1]) { + res = NBC_Sched_recv ((char *) rbuf + rdispls[2 * dim + 1] * rcvext, false, rcounts[2 * dim + 1], rtype, srcs[2 * dim + 1], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } + } + if (MPI_PROC_NULL != srcs[2 * dim]) { + res = NBC_Sched_recv ((char *) rbuf + rdispls[2 * dim] * rcvext, false, rcounts[2 * dim], rtype, srcs[2 * dim], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } + } + } + } else { + for (int i = indegree - 1 ; i >= 0 ; --i) { + if (srcs[i] != MPI_PROC_NULL) { + res = NBC_Sched_recv ((char *)
rbuf + rdispls[i] * rcvext, false, rcounts[i], rtype, srcs[i], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } + } + } + } free (srcs); diff --git a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallw.c b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallw.c index 8b7ee0f050c..3bae06ce5c0 100644 --- a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallw.c +++ b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallw.c @@ -79,14 +79,40 @@ static int nbc_neighbor_alltoallw_init(const void *sbuf, const int *scounts, con /* simply loop over neighbors and post send/recv operations */ /* change recv order to solve the problem of opposite results in loop neigbor under 2 processes */ /* issue can see https://github.com/mpi-forum/mpi-issues/issues/153 */ - for (int i = indegree - 1 ; i >= 0 ; --i) { - if (srcs[i] != MPI_PROC_NULL) { - res = NBC_Sched_recv ((char *) rbuf + rdisps[i], false, rcounts[i], rtypes[i], srcs[i], schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + int flag = 0; + if (OMPI_COMM_IS_CART(comm)) { + for(int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { + if (comm->c_topo->mtc.cart->dims[dim] == 1) { + flag = 1; break; } } } + if (flag != 0) { + for (int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { + if (MPI_PROC_NULL != srcs[2 * dim + 1]) { + res = NBC_Sched_recv ((char *) rbuf + rdisps[2 * dim + 1], false, rcounts[2 * dim + 1], rtypes[2 * dim + 1], srcs[2 * dim + 1], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } + } + if (MPI_PROC_NULL != srcs[2 * dim]) { + res = NBC_Sched_recv ((char *) rbuf + rdisps[2 * dim], false, rcounts[2 * dim], rtypes[2 * dim], srcs[2 * dim], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } + } + } + } else { + for (int i = indegree - 1 ; i >= 0 ; --i) { + if (srcs[i] != MPI_PROC_NULL) { + res = NBC_Sched_recv ((char *) rbuf + rdisps[i], false, rcounts[i], rtypes[i], srcs[i], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } + } + }
+ } free (srcs); -- Gitee From f469bf9486e3e9d662ff93ef87e7182be521f209 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=88=E5=AE=87=E9=9A=86?= Date: Wed, 10 Sep 2025 16:51:51 +0800 Subject: [PATCH 2/2] cleancode --- ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c | 29 ++++++++++--------- .../mca/coll/libnbc/nbc_ineighbor_alltoallv.c | 29 ++++++++++--------- .../mca/coll/libnbc/nbc_ineighbor_alltoallw.c | 29 ++++++++++--------- 3 files changed, 45 insertions(+), 42 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c index 8dea1efad1b..5323dea9f11 100644 --- a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c +++ b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoall.c @@ -19,6 +19,7 @@ * */ #include "nbc_internal.h" +#include "ompi/mca/topo/base/base.h" /* cannot cache schedules because one cannot check locally if the pattern is the same!! */ #undef NBC_CACHE_SCHEDULE @@ -89,28 +90,28 @@ static int nbc_neighbor_alltoall_init(const void *sbuf, int scount, MPI_Datatype /* change recv order to solve the problem of opposite results in loop neigbor under 2 processes */ /* issue can see https://github.com/mpi-forum/mpi-issues/issues/153 */ - int flag = 0; + /* a Cartesian comm with only 1 process in some dimension needs special handling */ + bool is_cart_dim_one = false; if (OMPI_COMM_IS_CART(comm)) { - for(int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { + for(int dim = 0 ; dim < comm->c_topo->mtc.cart->ndims ; ++dim) { if (comm->c_topo->mtc.cart->dims[dim] == 1) { - flag = 1; + is_cart_dim_one = true; break; } } } - if (flag != 0) { - for (int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { - if (MPI_PROC_NULL != srcs[2 * dim + 1]) { - res = NBC_Sched_recv ((char *) rbuf + (MPI_Aint) rcvext * (2 * dim + 1) * rcount, true, rcount, rtype, srcs[2 * dim + 1], schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - break; + if (is_cart_dim_one) { + for (int dim = 0; dim < comm->c_topo->mtc.cart->ndims ; ++dim) { + for
(int i = 1 ; i >= 0 ; --i) { + if (MPI_PROC_NULL != srcs[2 * dim + i]) { + res = NBC_Sched_recv ((char *) rbuf + (MPI_Aint) rcvext * (2 * dim + i) * rcount, true, rcount, rtype, srcs[2 * dim + i], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } } } - if (MPI_PROC_NULL != srcs[2 * dim]) { - res = NBC_Sched_recv ((char *) rbuf + (MPI_Aint) rcvext * (2 * dim) * rcount, true, rcount, rtype, srcs[2 * dim], schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - break; - } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; } } } else { diff --git a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallv.c b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallv.c index b886fc2cf2c..93c772ab8f0 100644 --- a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallv.c +++ b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallv.c @@ -19,6 +19,7 @@ * */ #include "nbc_internal.h" +#include "ompi/mca/topo/base/base.h" /* cannot cache schedules because one cannot check locally if the pattern is the same!! */ #undef NBC_CACHE_SCHEDULE @@ -93,28 +94,28 @@ static int nbc_neighbor_alltoallv_init(const void *sbuf, const int *scounts, con /* simply loop over neighbors and post send/recv operations */ /* change recv order to solve the problem of opposite results in loop neigbor under 2 processes */ /* issue can see https://github.com/mpi-forum/mpi-issues/issues/153 */ - int flag = 0; + /* a Cartesian comm with only 1 process in some dimension needs special handling */ + bool is_cart_dim_one = false; if (OMPI_COMM_IS_CART(comm)) { - for(int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { + for(int dim = 0 ; dim < comm->c_topo->mtc.cart->ndims ; ++dim) { if (comm->c_topo->mtc.cart->dims[dim] == 1) { - flag = 1; + is_cart_dim_one = true; break; } } } - if (flag != 0) { - for (int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { - if (MPI_PROC_NULL != srcs[2 * dim + 1]) { - res = NBC_Sched_recv ((char *) rbuf + rdispls[2 * dim + 1] * rcvext, false, rcounts[2 * dim + 1], rtype, srcs[2 * dim + 1], schedule,
false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - break; + if (is_cart_dim_one) { + for (int dim = 0 ; dim < comm->c_topo->mtc.cart->ndims ; ++dim) { + for (int i = 1 ; i >= 0 ; --i) { + if (MPI_PROC_NULL != srcs[2 * dim + i]) { + res = NBC_Sched_recv ((char *) rbuf + rdispls[2 * dim + i] * rcvext, false, rcounts[2 * dim + i], rtype, srcs[2 * dim + i], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } } } - if (MPI_PROC_NULL != srcs[2 * dim]) { - res = NBC_Sched_recv ((char *) rbuf + rdispls[2 * dim] * rcvext, false, rcounts[2 * dim], rtype, srcs[2 * dim], schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - break; - } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; } } } else { diff --git a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallw.c b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallw.c index 3bae06ce5c0..f012d58ccec 100644 --- a/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallw.c +++ b/ompi/mca/coll/libnbc/nbc_ineighbor_alltoallw.c @@ -19,6 +19,7 @@ * */ #include "nbc_internal.h" +#include "ompi/mca/topo/base/base.h" /* cannot cache schedules because one cannot check locally if the pattern is the same!! 
*/ #undef NBC_CACHE_SCHEDULE @@ -79,28 +80,28 @@ static int nbc_neighbor_alltoallw_init(const void *sbuf, const int *scounts, con /* simply loop over neighbors and post send/recv operations */ /* change recv order to solve the problem of opposite results in loop neigbor under 2 processes */ /* issue can see https://github.com/mpi-forum/mpi-issues/issues/153 */ - int flag = 0; + /* a Cartesian comm with only 1 process in some dimension needs special handling */ + bool is_cart_dim_one = false; if (OMPI_COMM_IS_CART(comm)) { - for(int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { + for(int dim = 0 ; dim < comm->c_topo->mtc.cart->ndims ; ++dim) { if (comm->c_topo->mtc.cart->dims[dim] == 1) { - flag = 1; + is_cart_dim_one = true; break; } } } - if (flag != 0) { - for (int dim = 0; dim < comm->c_topo->mtc.cart->ndims; dim++) { - if (MPI_PROC_NULL != srcs[2 * dim + 1]) { - res = NBC_Sched_recv ((char *) rbuf + rdisps[2 * dim + 1], false, rcounts[2 * dim + 1], rtypes[2 * dim + 1], srcs[2 * dim + 1], schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - break; + if (is_cart_dim_one) { + for (int dim = 0 ; dim < comm->c_topo->mtc.cart->ndims ; ++dim) { + for (int i = 1 ; i >= 0 ; --i) { + if (MPI_PROC_NULL != srcs[2 * dim + i]) { + res = NBC_Sched_recv ((char *) rbuf + rdisps[2 * dim + i], false, rcounts[2 * dim + i], rtypes[2 * dim + i], srcs[2 * dim + i], schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; + } } } - if (MPI_PROC_NULL != srcs[2 * dim]) { - res = NBC_Sched_recv ((char *) rbuf + rdisps[2 * dim], false, rcounts[2 * dim], rtypes[2 * dim], srcs[2 * dim], schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - break; - } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + break; } } } else { -- Gitee