From: Anthony Geay
Date: Wed, 4 Nov 2020 15:30:45 +0000 (+0100)
Subject: Reduce number of collective calls that kill performance even if mem peak is higher
X-Git-Tag: V9_6_asterxx_0~6^2~1
X-Git-Url: http://git.salome-platform.org/gitweb/?a=commitdiff_plain;h=86aef02dc7d347bd9e22f501bbf65872a3bdce28;p=tools%2Fmedcoupling.git

Reduce number of collective calls that kill performance even if mem peak is higher
---

diff --git a/src/ParaMEDMEM/ParaUMesh.cxx b/src/ParaMEDMEM/ParaUMesh.cxx
index 678c5ea9a..7503a36aa 100644
--- a/src/ParaMEDMEM/ParaUMesh.cxx
+++ b/src/ParaMEDMEM/ParaUMesh.cxx
@@ -88,19 +88,20 @@ MCAuto<DataArrayIdType> ParaUMesh::getCellIdsLyingOnNodesTrue(const DataArrayIdT
   std::unique_ptr<mcIdType[]> nbOfElems(new mcIdType[size]),nbOfElems2(new mcIdType[size]),nbOfElems3(new mcIdType[size]);
   mcIdType nbOfNodeIdsLoc(globalNodeIds->getNumberOfTuples());
   ci.allGather(&nbOfNodeIdsLoc,1,MPI_ID_TYPE,nbOfElems.get(),1,MPI_ID_TYPE,comm);
-  //store for each proc the local nodeids intercepted by current proc
   std::vector< MCAuto<DataArrayIdType> > tabs(size);
+  //store for each proc the local nodeids intercepted by current proc
+  int nbOfCollectiveCalls = 1;// this parameter controls the memory peak
   // loop to avoid to all procs to have all the nodes per proc
-  for(int subDiv = 0 ; subDiv < size ; ++subDiv)
+  for(int subDiv = 0 ; subDiv < nbOfCollectiveCalls ; ++subDiv)
   {
-    std::unique_ptr<mcIdType[]> nbOfElemsSp(CommInterface::SplitArrayOfLength(nbOfElems,size,subDiv,size));
+    std::unique_ptr<mcIdType[]> nbOfElemsSp(CommInterface::SplitArrayOfLength(nbOfElems,size,subDiv,nbOfCollectiveCalls));
     mcIdType nbOfNodeIdsSum(std::accumulate(nbOfElemsSp.get(),nbOfElemsSp.get()+size,0));
     std::unique_ptr<mcIdType[]> allGlobalNodeIds(new mcIdType[nbOfNodeIdsSum]);
     std::unique_ptr<int[]> nbOfElemsInt( CommInterface::ToIntArray(nbOfElemsSp,size) );
     std::unique_ptr<int[]> offsetsIn( CommInterface::ComputeOffset(nbOfElemsInt,size) );
     mcIdType startGlobalNodeIds,endGlobalNodeIds;
-    DataArray::GetSlice(0,globalNodeIds->getNumberOfTuples(),1,subDiv,size,startGlobalNodeIds,endGlobalNodeIds);
-    ci.allGatherV(globalNodeIds->begin()+startGlobalNodeIds,endGlobalNodeIds-startGlobalNodeIds,MPI_ID_TYPE,allGlobalNodeIds.get(),nbOfElemsInt.get(),offsetsIn.get(),MPI_ID_TYPE,comm);
+    DataArray::GetSlice(0,globalNodeIds->getNumberOfTuples(),1,subDiv,nbOfCollectiveCalls,startGlobalNodeIds,endGlobalNodeIds);
+    ci.allGatherV(globalNodeIds->begin()+startGlobalNodeIds,FromIdType<int>(endGlobalNodeIds-startGlobalNodeIds),MPI_ID_TYPE,allGlobalNodeIds.get(),nbOfElemsInt.get(),offsetsIn.get(),MPI_ID_TYPE,comm);
     mcIdType offset(0);
     for(int curRk = 0 ; curRk < size ; ++curRk)
       {
@@ -156,18 +157,19 @@ MCAuto<DataArrayIdType> ParaUMesh::getCellIdsLyingOnNodesFalse(const DataArrayId
   std::unique_ptr<mcIdType[]> nbOfElems(new mcIdType[size]),nbOfElems2(new mcIdType[size]),nbOfElems3(new mcIdType[size]);
   mcIdType nbOfNodeIdsLoc(globalNodeIds->getNumberOfTuples());
   ci.allGather(&nbOfNodeIdsLoc,1,MPI_ID_TYPE,nbOfElems.get(),1,MPI_ID_TYPE,comm);
-  std::vector< MCAuto<DataArrayIdType> > tabs(size);
   // loop to avoid to all procs to have all the nodes per proc
-  for(int subDiv = 0 ; subDiv < size ; ++subDiv)
+  int nbOfCollectiveCalls = 1;// this parameter controls the memory peak
+  std::vector< MCAuto<DataArrayIdType> > tabs(size);
+  for(int subDiv = 0 ; subDiv < nbOfCollectiveCalls ; ++subDiv)
   {
-    std::unique_ptr<mcIdType[]> nbOfElemsSp(CommInterface::SplitArrayOfLength(nbOfElems,size,subDiv,size));
+    std::unique_ptr<mcIdType[]> nbOfElemsSp(CommInterface::SplitArrayOfLength(nbOfElems,size,subDiv,nbOfCollectiveCalls));
     mcIdType nbOfNodeIdsSum(std::accumulate(nbOfElemsSp.get(),nbOfElemsSp.get()+size,0));
     std::unique_ptr<mcIdType[]> allGlobalNodeIds(new mcIdType[nbOfNodeIdsSum]);
     std::unique_ptr<int[]> nbOfElemsInt( CommInterface::ToIntArray(nbOfElemsSp,size) );
     std::unique_ptr<int[]> offsetsIn( CommInterface::ComputeOffset(nbOfElemsInt,size) );
     mcIdType startGlobalNodeIds,endGlobalNodeIds;
-    DataArray::GetSlice(0,globalNodeIds->getNumberOfTuples(),1,subDiv,size,startGlobalNodeIds,endGlobalNodeIds);
-    ci.allGatherV(globalNodeIds->begin()+startGlobalNodeIds,endGlobalNodeIds-startGlobalNodeIds,MPI_ID_TYPE,allGlobalNodeIds.get(),nbOfElemsInt.get(),offsetsIn.get(),MPI_ID_TYPE,comm);
+    DataArray::GetSlice(0,globalNodeIds->getNumberOfTuples(),1,subDiv,nbOfCollectiveCalls,startGlobalNodeIds,endGlobalNodeIds);
+    ci.allGatherV(globalNodeIds->begin()+startGlobalNodeIds,FromIdType<int>(endGlobalNodeIds-startGlobalNodeIds),MPI_ID_TYPE,allGlobalNodeIds.get(),nbOfElemsInt.get(),offsetsIn.get(),MPI_ID_TYPE,comm);
     mcIdType offset(0);
     for(int curRk = 0 ; curRk < size ; ++curRk)
       {
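
For context, below is a minimal standalone sketch of the trade-off this patch tunes, written against plain MPI rather than MEDCoupling's CommInterface: the node ids of every rank are gathered on all ranks in nbOfCollectiveCalls rounds, so fewer rounds mean fewer collectives but a larger temporary buffer per round. The slice() helper is a hypothetical stand-in for DataArray::GetSlice / CommInterface::SplitArrayOfLength; only the parameter name nbOfCollectiveCalls mirrors the patch, nothing here is MEDCoupling code.

// Sketch only: plain MPI illustration of splitting an all-gather into
// nbOfCollectiveCalls rounds; slice() is a hypothetical helper.
#include <mpi.h>
#include <algorithm>
#include <numeric>
#include <vector>
#include <cstdio>

// [begin,end) of the part of `n` local elements contributed in round `round` of `nbRounds`.
static void slice(int n, int round, int nbRounds, int &begin, int &end)
{
  int base = n / nbRounds, rem = n % nbRounds;
  begin = round * base + std::min(round, rem);
  end = begin + base + (round < rem ? 1 : 0);
}

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  // Each rank owns a small block of ids, just to have something to gather.
  std::vector<long> myIds(rank + 1);
  std::iota(myIds.begin(), myIds.end(), 10L * rank);

  const int nbOfCollectiveCalls = 1; // 1 = single big collective (this patch); size = old behaviour

  for (int round = 0; round < nbOfCollectiveCalls; ++round)
  {
    // Each rank announces how many ids it contributes in this round.
    int b, e;
    slice(static_cast<int>(myIds.size()), round, nbOfCollectiveCalls, b, e);
    int myCount = e - b;
    std::vector<int> counts(size);
    MPI_Allgather(&myCount, 1, MPI_INT, counts.data(), 1, MPI_INT, MPI_COMM_WORLD);
    std::vector<int> offsets(size, 0);
    std::partial_sum(counts.begin(), counts.end() - 1, offsets.begin() + 1);
    int total = offsets.back() + counts.back();

    // The temporary buffer only holds this round's share of everybody's ids:
    // it shrinks as nbOfCollectiveCalls grows, at the price of more collectives.
    std::vector<long> allIds(total);
    MPI_Allgatherv(myIds.data() + b, myCount, MPI_LONG,
                   allIds.data(), counts.data(), offsets.data(), MPI_LONG, MPI_COMM_WORLD);

    if (rank == 0)
      std::printf("round %d: gathered %d ids\n", round, total);
  }

  MPI_Finalize();
  return 0;
}

With nbOfCollectiveCalls = 1, the value hard-coded above, the loop degenerates to a single allGatherV whose buffer holds every rank's ids at once; setting it to the communicator size would recover the previous behaviour of `size` smaller collectives with a lower memory peak.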