// Copyright (C) 2007-2024  CEA, EDF
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
//
// Author : Anthony Geay (CEA/DEN)
#ifndef __OVERLAPDEC_HXX__
#define __OVERLAPDEC_HXX__

#include "DEC.hxx"
#include "InterpolationOptions.hxx"

#include <mpi.h>
#include <set>
#include <ostream>

namespace ICoCo { class MEDDoubleField; }

namespace MEDCoupling
{
class OverlapInterpolationMatrix;
class OverlapElementLocator;
class ProcessorGroup;
class ParaFIELD;
class MEDCouplingFieldDouble;
/*!
\anchor OverlapDEC-det
\section OverlapDEC-over Overview

The \c OverlapDEC enables the \ref InterpKerRemapGlobal "conservative remapping" of fields between
two parallel codes. This remapping is based on the computation of intersection volumes on
a \b single \b processor \b group. On this processor group are defined two field templates called A
and B. The computation is possible for 3D meshes, 2D meshes, 3D-surface meshes, 1D meshes and
2D-curve meshes. The dimensions must be the same for the two field templates A and B.
The main difference with \ref InterpKernelDEC-det "InterpKernelDEC" is that this
\ref para-dec "DEC" works with a *single* processor group, in which the processors share the work.
Consequently each processor manages two \ref MEDCouplingFieldTemplatesPage "field templates" (A and B),
called source and target.
Furthermore all the processors in the group cooperate in the computation of the global interpolation
matrix. In this respect \c InterpKernelDEC is a specialization of \c OverlapDEC.
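
A typical usage, sketched from what is done in ParaMEDMEMTest_OverlapDEC.cxx (the field
variables and the default value are illustrative), looks like this:

\code{.cpp}
// All the procs of the communicator belong to the single OverlapDEC group.
std::set<int> procs;
procs.insert(0); procs.insert(1); procs.insert(2);
OverlapDEC dec(procs);

// Each proc attaches its local chunk of the source (A) and target (B) fields.
dec.attachSourceLocalField(srcField);   // ParaFIELD *srcField (illustrative)
dec.attachTargetLocalField(trgField);   // ParaFIELD *trgField (illustrative)
dec.setDefaultValue(0.0);               // target cells receiving nothing get 0.0

dec.synchronize();                      // builds the distributed interpolation matrix
dec.sendRecvData(true);                 // source -> target matrix-vector product
\endcode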
\section ParaMEDMEMOverlapDECAlgorithmDescription Algorithm description
Let's consider the following use case, run in ParaMEDMEMTest_OverlapDEC.cxx, to describe
the different steps of the computation. The processor group contains 3 processors.
\anchor ParaMEDMEMOverlapDECImgTest1
\image html OverlapDEC1.png "Example split of the source and target mesh among the 3 procs"
\subsection ParaMEDMEMOverlapDECAlgoStep1 Step 1 : Bounding box exchange and computation of the global interactions between procs

In order to reduce as much as possible the amount of communication between distant processors,
every processor computes a bounding box for A and B. Then an AllToAll communication is performed
so that every processor can compute the \b global interactions between processors.
This computation leads every processor to build the same global TODO list, expressed as a list
of pairs. A pair (x,y) means that the field template A of proc \b x can interact with the field
template B of proc \b y, because the two bounding boxes intersect.
In the \ref ParaMEDMEMOverlapDECImgTest1 "example above" this computation leads to the following
global TODO list:

\b (0,0), (0,1), (1,0), (1,2), (2,0), (2,1), (2,2)

Here the pair (0,2) does not appear because the bounding box of field template A of proc\#0 does
not intersect that of field template B on proc\#2. Notice that this pairing is not symmetric:
(0,2) is not the same as (2,0), since source and target cannot be swapped.
This stage is performed by MEDCoupling::OverlapElementLocator::computeBoundingBoxesAndInteractionList.
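
The interaction test itself reduces to an axis-aligned bounding box overlap check. A minimal
sketch (assuming boxes stored flat as [xmin,xmax,ymin,ymax,...] and a tolerance eps) could be:

\code{.cpp}
// True if the two axis-aligned boxes intersect, up to the tolerance eps.
bool boxesIntersect(const double *bb1, const double *bb2, int dim, double eps)
{
  for(int d = 0; d < dim; d++)
    if(bb1[2*d] > bb2[2*d+1] + eps || bb2[2*d] > bb1[2*d+1] + eps)
      return false;  // disjoint along axis d => no interaction possible
  return true;
}
\endcode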
\subsection ParaMEDMEMOverlapDECAlgoStep2 Step 2 : Computation of the local TODO list

Starting from the global interaction list computed in \ref ParaMEDMEMOverlapDECAlgoStep1
"Step 1", each proc computes its own local TODO list. A pair (x,y) can be treated by either proc \#x or proc \#y,
the choice being made so as to reduce the amount of data transferred among processors.

Several strategies are possible. Historically (setWorkSharingAlgo(0)) the following was implemented:
Each processor starts with an empty \b local TODO list. Then, for each pair (k,m) of the
\b global TODO list:
- if the temporary local TODO list of proc\#k is shorter than that of proc\#m, the pair (k,m) is
added to the temporary local TODO list of proc\#k;
- if the temporary local TODO list of proc\#m is shorter than that of proc\#k, the pair (k,m) is
added to the temporary local TODO list of proc\#m;
- if both lists have the same length, the pair (k,m) is added to the temporary local TODO list
of proc\#k.
In the \ref ParaMEDMEMOverlapDECImgTest1 "example above" this computation leads to the following
local TODO lists:

- proc\#0 : (0,0)
- proc\#1 : (0,1),(1,0)
- proc\#2 : (1,2),(2,0),(2,1),(2,2)
This algorithm is implemented in OverlapElementLocator::computeTodoList_original.
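
In pseudo-C++ the greedy rule above can be sketched as follows (the container layout is
illustrative, not the actual implementation):

\code{.cpp}
// Greedy sharing of the global TODO list (work sharing algo 0, sketch).
std::vector<std::vector<std::pair<int,int> > > todo(nbProcs);
for(const std::pair<int,int>& p : globalTodo)   // p = (k,m)
{
  // Give (k,m) to whichever of proc#k / proc#m currently holds the shorter
  // temporary local list; ties go to proc#k.
  int tgt = (todo[p.second].size() < todo[p.first].size()) ? p.second : p.first;
  todo[tgt].push_back(p);
}
\endcode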
Another version of the work sharing algorithm (setWorkSharingAlgo(1), now the default) is provided
in OverlapElementLocator::computeTodoList_new and aims to be more efficient:

- a job (i,j) is initially assigned to both proc\#i and proc\#j;
- we then scan all the procs and identify the one with the biggest load;
- on this proc, we scan all the pairs (i,j) and remove the duplicated one whose peer is the least loaded remote proc;
- doing so, we have reduced the load of the most loaded proc;
- the process is repeated until no duplicated job remains on any proc.
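
A condensed sketch of this iterative balancing (the helpers marked below are hypothetical; the
real code lives in OverlapElementLocator::computeTodoList_new):

\code{.cpp}
// Work sharing algo 1 (sketch): every job (i,j) starts duplicated on proc#i
// and proc#j, then the most loaded proc is repeatedly unloaded.
while(true)
{
  int heavy = procWithBiggestLoad(todo);                  // hypothetical helper
  std::pair<int,int> job;
  // Among the still-duplicated jobs of 'heavy', pick the one whose peer
  // (the other owner of the duplicate) is the least loaded.
  if(!pickDuplicateWithLeastLoadedPeer(todo, heavy, job)) // hypothetical helper
    break;                                                // no duplicate left: done
  removeJob(todo[heavy], job);                            // the peer keeps the job
}
\endcode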
At the end of this stage each proc knows precisely its \b local TODO list (with regard to interpolation).
The \b local TODO lists of the other procs are also kept for future computations.
\subsection ParaMEDMEMOverlapDECAlgoStep3 Step 3 : Field-template exchange between procs

The aim now is to exchange the (source and target) field templates between processors.
Knowing for every processor the \b local TODO list, each proc computes the \b exchange TODO list.

In the \ref ParaMEDMEMOverlapDECImgTest1 "example above" (using the first load balancing algorithm described)
the \b exchange TODO list looks like this:
Sending TODO list per proc:

- proc \#0 : send field template A to proc\#1, send field template B to proc\#1, send field template B to proc\#2
- proc \#1 : send field template A to proc\#2, send field template B to proc\#2
- proc \#2 : no send.
For example, the initial TODO list indicated (0,1) on proc\#1, meaning that proc\#1 is in charge
of computing the intersection between the part of the source (A) held by proc\#0 and the part of the
target (B) located on proc\#1. Hence proc\#0 needs to send A to proc\#1.
Similarly, here is the receiving TODO list per proc:

- proc \#0 : no receiving
- proc \#1 : receive field template A from proc\#0, receive field template B from proc\#0
- proc \#2 : receive field template B from proc\#0, receive field template A from proc\#1,
receive field template B from proc\#1
To avoid as much as possible large volumes of transfers between procs, only the relevant parts of
the meshes are sent. In order for proc\#k to send its field template A to proc\#m, proc\#k computes
the part of mesh A contained in the bounding box of field template B of proc\#m. The cell ids or
node ids of this restricted part are sent to proc \#m too.
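
Using the public MEDCoupling API, this restriction step can be emulated as in the sketch below
(the tolerance value and variable names are illustrative; the DEC does this internally):

\code{.cpp}
// Restrict the local source mesh to the part lying in a remote bounding box
// (laid out flat as [xmin,xmax,ymin,ymax,...]), keeping the original cell ids
// so the remote proc can map the contributions back.
DataArrayIdType *cellIds = srcMesh->getCellsInBoundingBox(bbox, 1e-12);
MEDCouplingPointSet *part = srcMesh->buildPartOfMySelf(cellIds->begin(), cellIds->end(), true);
// 'part' and 'cellIds' are what gets shipped to the remote proc.
// The caller owns both objects: decrRef() them once sent.
\endcode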
Let's consider the couple (k,m) in the TODO list. This couple is treated by either proc \#k or proc \#m, as
seen in \ref ParaMEDMEMOverlapDECAlgoStep2 "Step 2".

As will be needed for the final matrix-vector computations, the resulting matrix of the
couple (k,m), wherever it is computed (proc \#k or proc \#m),
will be stored on \b proc\#m (target side).
- If proc \#k is in charge (performs the matrix computation) for this couple (k,m), the target ids
(cells or nodes) of the mesh on proc\#m are renumbered, because proc\#m has selected a sub-mesh
of the target mesh to avoid transferring large amounts of data. In this case, as proc\#m is ultimately
in charge of the matrix, proc\#k must carefully keep the
source ids needed to be sent to proc\#m. No problem will appear for the matrix assembly on proc\#m
regarding the source ids, because no restriction was done on them.
Concerning the source ids to be sent for the matrix-vector computation, proc\#k knows precisely
which source field values to send to proc\#m.
This is handled by OverlapMapping::keepTracksOfTargetIds on proc\#m.
- If proc \#m is in charge (performs the matrix computation) for this couple (k,m), the source ids (cells
or nodes) of the mesh on proc \#k are renumbered, because proc \#k has selected a sub-mesh of the
source mesh to avoid transferring large amounts of data. In this case, as proc \#m is ultimately
in charge of the matrix, it receives the source ids
from the remote proc \#k, and thus the matrix is directly correct; no renumbering is needed as
in \ref ParaMEDMEMOverlapDECAlgoStep5 "Step 5". However proc \#k must
keep track of the ids sent to proc \#m for the matrix-vector computation.
This is handled by OverlapMapping::keepTracksOfSourceIds on proc\#k.
This step is performed by the MEDCoupling::OverlapElementLocator::exchangeMeshes method.
\subsection ParaMEDMEMOverlapDECAlgoStep4 Step 4 : Computation of the interpolation matrix

After the mesh exchange of \ref ParaMEDMEMOverlapDECAlgoStep3 "Step 3", each processor has all the
required information to treat its \b local TODO list computed in
\ref ParaMEDMEMOverlapDECAlgoStep2 "Step 2". This step is potentially CPU costly, which is why
the \b local TODO list per proc is expected to be as well balanced as possible.

The interpolation is performed as the \ref MEDCoupling::MEDCouplingRemapper "remapper" does.

This operation is performed by the OverlapInterpolationMatrix::addContribution method.
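
For one pair of exchanged field templates, this is conceptually equivalent to the sequential
remapper call sketched below (assuming cell-based fields, i.e. a "P0P0" interpolation):

\code{.cpp}
#include "MEDCouplingRemapper.hxx"

MEDCouplingRemapper remap;
remap.prepare(srcMeshPart, trgMeshPart, "P0P0");  // computes the intersection volumes
// Sparse result: for each target cell, a map {source cell id -> weight}.
const std::vector<std::map<mcIdType,double> >& w = remap.getCrudeMatrix();
\endcode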
\subsection ParaMEDMEMOverlapDECAlgoStep5 Step 5 : Global matrix construction

Having processed the local TODO list at the end of \ref ParaMEDMEMOverlapDECAlgoStep4 "Step 4",
we need to assemble the final matrix.

The final aim is to have a distributed matrix \f$ M_k \f$ on each proc\#k. In order to reduce
data exchange during the matrix product process,
\f$ M_k \f$ is built as sizeof(Proc group) sparse blocks, each of type \c std::vector<std::map<int,double> >.

For a proc\#k, it is necessary to fetch the contributions of all the matrices built in
\ref ParaMEDMEMOverlapDECAlgoStep4 "Step 4" whose pair (i,j) has its second element equal to k.
After a call to MEDCoupling::OverlapMapping::prepare, the matrix repartition is the following:

- proc\#0 : (0,0),(1,0),(2,0)
- proc\#1 : (0,1),(2,1)
- proc\#2 : (1,2),(2,2)
Tuple (2,1), computed on proc\#2, is stored on proc\#1 after execution of the function
"prepare"; this is an example of a pair treated by proc \#x (the source side) in \ref ParaMEDMEMOverlapDECAlgoStep2 "Step 2".
Tuple (0,1), computed on proc\#1, is stored on proc\#1 too; this is an example of a pair treated by proc \#y (the target side).
In the end, MEDCoupling::OverlapMapping::_proc_ids_to_send_vector_st and
MEDCoupling::OverlapMapping::_proc_ids_to_recv_vector_st will contain, for each proc, the ranks
of the procs to send data to and to receive data from during the final matrix-vector computation.

The method in charge of performing this is MEDCoupling::OverlapMapping::prepare.
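
With this storage, the local part of the final matrix-vector product is a plain walk over the
sparse rows, as sketched below (the real product also handles the exchange of remote source values):

\code{.cpp}
// One sparse block: for each local target id, a map {source id -> weight}.
typedef std::vector<std::map<int,double> > SparseBlock;

// y += M_block * x : accumulate the contribution of one block into the target field.
void applyBlock(const SparseBlock& m, const double *x, double *y)
{
  for(std::size_t row = 0; row < m.size(); row++)
    for(std::map<int,double>::const_iterator it = m[row].begin(); it != m[row].end(); ++it)
      y[row] += it->second * x[it->first];
}
\endcode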
*/
class OverlapDEC : public DEC, public INTERP_KERNEL::InterpolationOptions
{
public:
  OverlapDEC(const std::set<int>& procIds, const MPI_Comm& world_comm=MPI_COMM_WORLD);
  virtual ~OverlapDEC();

  void sendRecvData(bool way=true);
  void sendData();
  void recvData();
  void synchronize();

  void attachSourceLocalField(ParaFIELD *field, bool ownPt=false);
  void attachTargetLocalField(ParaFIELD *field, bool ownPt=false);
  void attachSourceLocalField(MEDCouplingFieldDouble *field);
  void attachTargetLocalField(MEDCouplingFieldDouble *field);
  void attachSourceLocalField(ICoCo::MEDDoubleField *field);
  void attachTargetLocalField(ICoCo::MEDDoubleField *field);
  ProcessorGroup *getGroup() { return _group; }
  bool isInGroup() const;

  void setDefaultValue(double val) { _default_field_value = val; }
  //! 0 means the initial algo from Antho, 1 or 2 means Adrien's algo (2 should be better). Make your choice :-))
  void setWorkSharingAlgo(int method) { _load_balancing_algo = method; }

  void debugPrintWorkSharing(std::ostream & ostr) const;
private:
  int _load_balancing_algo;

  OverlapInterpolationMatrix *_interpolation_matrix;
  OverlapElementLocator *_locator;
  ProcessorGroup *_group;

  double _default_field_value;

  ParaFIELD *_source_field;
  bool _own_source_field;
  ParaFIELD *_target_field;
  bool _own_target_field;
};
}

#endif