From fd4a2dc2481657121472897dae7a06100d7c91e0 Mon Sep 17 00:00:00 2001
From: abn <adrien.bruneton@cea.fr>
Date: Thu, 7 Dec 2023 21:03:47 +0100
Subject: [PATCH] [OverlapDEC] Bug fix for transfer of multi-compo field

+ adding tests illustrating transfer of several fields
+ doc and minor fix in workSharing algo (was improperly identifying less loaded proc -> should not impact result, just balancing)
+ C++ style loops
+ more debug info
---
 CMakeLists.txt                              |   2 +-
 resources/dev/mc_suppr_valgrind             |  93 +++++++-
 src/ParaMEDMEM/OverlapDEC.hxx               |  80 ++++---
 src/ParaMEDMEM/OverlapElementLocator.cxx    | 237 +++++++++++---------
 src/ParaMEDMEM/OverlapElementLocator.hxx    |  10 +-
 src/ParaMEDMEM/OverlapMapping.cxx           |  17 +-
 src/ParaMEDMEM_Swig/test_InterpKernelDEC.py |  67 +++++-
 src/ParaMEDMEM_Swig/test_OverlapDEC.py      | 122 +++++++---
 8 files changed, 442 insertions(+), 186 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91b2e1048..3053089cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,7 +61,7 @@ INCLUDE(SalomeSetupPlatform)
 #    add_definitions(-Weverything)
 #    add_definitions(-Wno-inconsistent-missing-override)
 #    add_definitions(-Wno-c++98-compat -Wno-c++98-compat-pedantic)
-#    add_definitions(-Wno-sign-conversion)
+#    add_definitions(-Wsign-conversion)
 #    add_definitions(-Wno-switch-enum)
 #
 #    add_definitions(-Wno-documentation-unknown-command) # \image in Doxygen
diff --git a/resources/dev/mc_suppr_valgrind b/resources/dev/mc_suppr_valgrind
index 9609c3887..869db276d 100644
--- a/resources/dev/mc_suppr_valgrind
+++ b/resources/dev/mc_suppr_valgrind
@@ -2,11 +2,100 @@
 # SWIG suppressions for MEDCoupling
 #
 
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:cfunction_vectorcall_FASTCALL
+}
+
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:cfunction_vectorcall_FASTCALL_KEYWORDS
+}
+
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:*PyImport_ImportModule*
+}
+
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:*PyObject_FastCallDict*
+}
+
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:*PyObject_CallFunctionObjArgs*
+}
+
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:*ufunc_generic_fastcall*
+}
+
+{
+   <eval_vec>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:_PyObject_*alloc*
+   ...
+   fun:PyList_New
+   ...
+   fun:_PyEval_EvalFrameDefault
+}
+
+
+{
+   <eval_vec2>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:POINTER
+   fun:cfunction_vectorcall_O
+   ...
+   fun:_PyEval_Vector
+}
+
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:_PyObject_GenericSetAttrWithDict
+}
+
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   fun:*alloc
+   ...
+   fun:unicode_decode_utf8
+}
+
 {
    <malloc>
    Memcheck:Leak
    fun:malloc
-   fun:PyObject_Malloc
+   ...
+   fun:*PyObject_Malloc*
 }
 
 {
@@ -14,7 +103,7 @@
    Memcheck:Leak
    fun:*alloc
    ...
-   fun:PyUnicode_*
+   fun:*PyUnicode_*
 }
 
 {
diff --git a/src/ParaMEDMEM/OverlapDEC.hxx b/src/ParaMEDMEM/OverlapDEC.hxx
index 993c93376..2adf70ead 100644
--- a/src/ParaMEDMEM/OverlapDEC.hxx
+++ b/src/ParaMEDMEM/OverlapDEC.hxx
@@ -69,27 +69,30 @@ namespace MEDCoupling
       In order to reduce as much as possible the amount of communications between distant processors,
       every processor computes a bounding box for A and B. Then a AllToAll communication is performed
       so that
-      every processor can compute the \b global interactions between processor.
+      every processor can compute the \b global interactions between processors.
       This computation leads every processor to compute the same global TODO list expressed as a list
-      of pair. A pair ( x, y ) means that proc \b x fieldtemplate A can interact with fieltemplate B of
+      of pair. A pair ( x, y ) means that proc \b x fieldtemplate A can interact with field-template B of
       proc \b y because the two bounding boxes interact.
       In the \ref ParaMEDMEMOverlapDECImgTest1 "example above" this computation leads to the following
-      a \b global TODO list :
+      \b global TODO list :
 
-      \b (0,0),(0,1),(1,0),(1,2),(2,0),(2,1),(2,2)
+      \b (0,0), (0,1), (1,0), (1,2), (2,0), (2,1), (2,2)
 
       Here the pair (0,2) does not appear because the bounding box of fieldtemplateA of proc#2 does
-      not intersect that of fieldtemplate B on proc#0.
+      not intersect that of fieldtemplate B on proc#0. Notice that this pairing is not symmetric!
+      (0,2) is not the same as (2,0) since source and target can not be swapped.
 
-      Stage performed by MEDCoupling::OverlapElementLocator::computeBoundingBoxes.
+      Stage performed by MEDCoupling::OverlapElementLocator::computeBoundingBoxesAndInteractionList.
 
       \subsection ParaMEDMEMOverlapDECAlgoStep2 Step 2 : Computation of local TODO list
 
       Starting from the global interaction previously computed in \ref ParaMEDMEMOverlapDECAlgoStep1
-      "Step 1", each proc computes the TODO list per proc.
-      The following rules is chosen : a pair (x,y) can be treated by either proc \#x or proc \#y,
-      in order to reduce the amount of data transfers among
-      processors. The algorithm chosen for load balancing is the following : Each processor has
+      "Step 1", each proc computes the TODO list per proc. A pair (x,y) can be treated by either proc \#x or proc \#y,
+      in order to reduce the amount of data transfers among processors.
+
+      Several strategies are possible. Historically (setWorkSharingAlgo(0)) the following was implemented:
+
+      The algorithm chosen for load balancing is the following : Each processor has
       an empty \b local TODO list at the beginning. Then for each pair (k,m) in
       \b global TODO list, if proc\#k has less temporary local list than proc\#m pair, (k,m) is added
       to temporary local TODO list of proc\#k.
@@ -105,20 +108,28 @@ namespace MEDCoupling
       - proc\#1 : (0,1),(1,0)
       - proc\#2 : (1,2),(2,0),(2,1),(2,2)
 
-      The algorithm described here is not perfect for this use case, we hope to enhance it soon.
+      This algo is coded in OverlapElementLocator::computeTodoList_original.
+
 
-      At this stage each proc knows precisely its \b local TODO list (with regard to interpolation).
-      The \b local TODO list of other procs than local
-      is kept for future computations.
+      Another version of the work sharing algorithm (setWorkSharingAlgo(1), the default now) is provided
+      in OverlapElementLocator::computeTodoList_new and hopes to be more efficient:
+
+      - a job (i,j) is assigned initially to both proc\#i and proc\#j.
+      - we then scan all the procs, identify the one with the biggest load
+      - on this proc, we scan all the pairs (i,j) and remove the one corresponding to the less loaded remote proc
+      - doing so, we have reduced the load of the most loaded proc
+      - the process is repeated until no more duplicate job is found on each proc.
+
+      At the end of this stage each proc knows precisely its \b local TODO list (with regard to interpolation).
+      The \b local TODO list of other procs than local is kept for future computations.
 
       \subsection ParaMEDMEMOverlapDECAlgoStep3 Step 3 : Matrix echange between procs
 
-      Knowing the \b local TODO list, the aim now is to exchange field-templates between procs.
-      Each proc computes knowing TODO list per
-      proc computed in \ref ParaMEDMEMOverlapDECAlgoStep2 "Step 2" the exchange TODO list :
+      The aim now is to exchange (source and target) field-templates between processors.
+      Knowing for every processor the \b local TODO list, each proc computes the \b exchange TODO list :
 
-      In the \ref ParaMEDMEMOverlapDECImgTest1 "example above" the exchange TODO list gives the
-      following results :
+      In the \ref ParaMEDMEMOverlapDECImgTest1 "example above" (using the first load balancing algorithm described)
+      the \b exchange TODO list looks like this:
 
       Sending TODO list per proc :
 
@@ -127,7 +138,11 @@ namespace MEDCoupling
       - Proc \#1 : Send fieldtemplate A to Proc\#2, Send fieldtemplate B to Proc\#2
       - Proc \#2 : No send.
 
-      Receiving TODO list per proc :
+      For example, initial TODO list was indicating (0,1) on proc\#1 meaning that proc\#1 will be in charge
+      of computing the intersection between part of the source (A) detained on proc\#0 with part of the
+      target (B) located on proc\#1. Hence proc\#0 needs to send A to proc\#1.
+
+      Similarly here it the receiving TODO list per proc :
 
       - proc \#0 : No receiving
       - proc \#1 : receiving fieldtemplate A from Proc\#0,  receiving fieldtemplate B from Proc\#0
@@ -135,8 +150,8 @@ namespace MEDCoupling
       receiving fieldtemplate B from Proc\#1
 
       To avoid as much as possible large volumes of transfers between procs, only relevant parts of
-      meshes are sent. In order for proc\#k to send fieldtemplate A to fieldtemplate B
-      of proc \#m., proc\#k computes the part of mesh A contained in the boundingbox B of proc\#m. It
+      the meshes are sent. In order for proc\#k to send fieldtemplate A to fieldtemplate B
+      of proc \#m, proc\#k computes the part of mesh A contained in the boundingbox B of proc\#m. It
       implies that the corresponding cellIds or nodeIds of the
       corresponding part are sent to proc \#m too.
 
@@ -145,17 +160,17 @@ namespace MEDCoupling
 
       As will be dealt in Step 6, for final matrix-vector computations, the resulting matrix of the
       couple (k,m) wherever it is computed (proc \#k or proc \#m)
-      will be stored in \b proc\#m.
+      will be stored in \b proc\#m (target side).
 
       - If proc \#k is in charge (performs the matrix computation) for this couple (k,m), target ids
-      (cells or nodes) of the mesh in proc \#m are renumbered, because proc \#m has seelected a sub mesh
-      of the target mesh to avoid large amounts of data to transfer. In this case as proc \#m is ultimately
-       in charge of the matrix, proc \#k must keep preciously the
-      source ids needed to be sent to proc\#m. No problem will appear for matrix assembling in proc m
+      (cells or nodes) of the mesh in proc\#m are renumbered, because proc\#m has selected a sub mesh
+      of the target mesh to avoid large amounts of data to transfer. In this case, as proc\#m is ultimately
+      in charge of the matrix, proc\#k must keep preciously the
+      source ids needed to be sent to proc\#m. No problem will appear for matrix assembling in proc\#m
       for source ids because no restriction was done.
-      Concerning source ids to be sent for the matrix-vector computation, proc k will know precisely
-      which source ids field values to send to proc \#m.
-      This is embodied by OverlapMapping::keepTracksOfTargetIds in proc m.
+      Concerning source ids to be sent for the matrix-vector computation, proc\#k will know precisely
+      which source ids field values to send to proc\#m.
+      This is embodied by OverlapMapping::keepTracksOfTargetIds in proc\#m.
 
       - If proc \#m is in charge (performs matrix computation) for this couple (k,m), source ids (cells
       or nodes) of the mesh in proc \#k are renumbered, because proc \#k has selected a sub mesh of the
@@ -164,7 +179,7 @@ namespace MEDCoupling
       from remote proc \#k, and thus the matrix is directly correct, no need for renumbering as
        in \ref ParaMEDMEMOverlapDECAlgoStep5 "Step 5". However proc \#k must
       keep track of the ids sent to proc \#m for the matrix-vector computation.
-      This is incarnated by OverlapMapping::keepTracksOfSourceIds in proc k.
+      This is incarnated by OverlapMapping::keepTracksOfSourceIds in proc\#k.
 
       This step is performed in MEDCoupling::OverlapElementLocator::exchangeMeshes method.
 
@@ -173,8 +188,7 @@ namespace MEDCoupling
       After mesh exchange in \ref ParaMEDMEMOverlapDECAlgoStep3 "Step3" each processor has all the
       required information to treat its \b local TODO list computed in
       \ref ParaMEDMEMOverlapDECAlgoStep2 "Step2". This step is potentially CPU costly, which is why
-      the \b local TODO list per proc is expected to
-      be as well balanced as possible.
+      the \b local TODO list per proc is expected to be as well balanced as possible.
 
       The interpolation is performed as the \ref MEDCoupling::MEDCouplingRemapper "remapper" does.
 
diff --git a/src/ParaMEDMEM/OverlapElementLocator.cxx b/src/ParaMEDMEM/OverlapElementLocator.cxx
index 5ff59d1bc..1192ad9ea 100644
--- a/src/ParaMEDMEM/OverlapElementLocator.cxx
+++ b/src/ParaMEDMEM/OverlapElementLocator.cxx
@@ -123,8 +123,7 @@ namespace MEDCoupling
                              _domain_bounding_boxes,bbSize, MPI_DOUBLE, 
                              *comm);
   
-    // Computation of all pairs needing an interpolation pairs are duplicated now !
-    
+    // Computation of all pairs needing an interpolation - pairs are duplicated now !
     _proc_pairs.clear();//first is source second is target
     _proc_pairs.resize(_group.size());
     for(int i=0;i<_group.size();i++)
@@ -133,18 +132,19 @@ namespace MEDCoupling
           _proc_pairs[i].push_back(j);
   }
 
+  /*! See main OverlapDEC documentation for an explanation on this one. This is the original work sharing algorithm.
+   */
   void OverlapElementLocator::computeTodoList_original()
   {
     // OK now let's assigning as balanced as possible, job to each proc of group
     _all_todo_lists.resize(_group.size());
-    int i=0;
-    for(std::vector< std::vector< int > >::const_iterator it1=_proc_pairs.begin();it1!=_proc_pairs.end();it1++,i++)
-      for(std::vector< int >::const_iterator it2=(*it1).begin();it2!=(*it1).end();it2++)
+    for(int i = 0; i < _group.size(); i++)
+      for(const int j: _proc_pairs[i])
         {
-          if(_all_todo_lists[i].size()<=_all_todo_lists[*it2].size())//it includes the fact that i==*it2
-            _all_todo_lists[i].push_back(ProcCouple(i,*it2));
+          if(_all_todo_lists[i].size()<=_all_todo_lists[j].size())//it includes the fact that i==j
+            _all_todo_lists[i].push_back(ProcCouple(i,j));
           else
-            _all_todo_lists[*it2].push_back(ProcCouple(i,*it2));
+            _all_todo_lists[j].push_back(ProcCouple(i,j));
         }
     //Keeping todo list of current proc. _to_do_list contains a set of pair where at least _group.myRank() appears once.
     //This proc will be in charge to perform interpolation of any of element of '_to_do_list'
@@ -157,13 +157,13 @@ namespace MEDCoupling
 #ifdef DEC_DEBUG
     std::stringstream scout;
     scout << "(" << myProcId << ") my TODO list is: ";
-    for (std::vector< ProcCouple >::const_iterator itdbg=_to_do_list.begin(); itdbg!=_to_do_list.end(); itdbg++)
-      scout << "(" << (*itdbg).first << "," << (*itdbg).second << ")";
+    for (const ProcCouple& pc: _to_do_list)
+      scout << "(" << pc.first << "," << pc.second << ")";
     std::cout << scout.str() << "\n";
 #endif
   }
 
-  /* More efficient (?) work sharing algorithm: a job (i,j) is initially assigned twice: to proc#i and to proc#j.
+  /*! More efficient (?) work sharing algorithm: a job (i,j) is initially assigned twice: to proc#i and to proc#j.
    * Then try to reduce as much as possible the variance of the num of jobs per proc by selecting the right duplicate
    * to remove:
    *  - take the most loaded proc i,
@@ -175,58 +175,66 @@ namespace MEDCoupling
   {
     using namespace std;
     int infinity = std::numeric_limits<int>::max();
+
+    //
     // Initialisation
+    //
     int grp_size = _group.size();
+    //    For each proc, a map giving for a job (=an interaction (i,j)) its 'load', i.e. the amount of work found on proc #j
     vector < map<ProcCouple, int> > full_set(grp_size );
-    int srcProcID = 0;
-    for(vector< vector< int > >::const_iterator it = _proc_pairs.begin(); it != _proc_pairs.end(); it++, srcProcID++)
-      for (vector< int >::const_iterator it2=(*it).begin(); it2 != (*it).end(); it2++)
-      {
-        // Here a pair of the form (i,i) is added only once!
-        int tgtProcID = *it2;
-        ProcCouple cpl = make_pair(srcProcID, tgtProcID);
-        full_set[srcProcID][cpl] = -1;
-        full_set[tgtProcID][cpl] = -1;
-      }
-    int procID = 0;
-    vector < map<ProcCouple, int> > ::iterator itVector;
-    map<ProcCouple, int>::iterator itMap;
-    for(itVector = full_set.begin(); itVector != full_set.end(); itVector++, procID++)
-      for (itMap=(*itVector).begin(); itMap != (*itVector).end(); itMap++)
+    for(int srcProcID = 0; srcProcID < _group.size(); srcProcID++)
+      for(const int tgtProcID: _proc_pairs[srcProcID])
         {
-          const ProcCouple & cpl = (*itMap).first;
-          if (cpl.first == cpl.second)
-            // special case - this couple can not be removed in the future
-            (*itMap).second = infinity;
-          else
-            {
-            if(cpl.first == procID)
-              (*itMap).second = (int)full_set[cpl.second].size();
-            else // cpl.second == srcProcID
-              (*itMap).second = (int)full_set[cpl.first].size();
-            }
+          // Here a pair of the form (i,i) is added only once!
+          ProcCouple cpl = make_pair(srcProcID, tgtProcID);
+          // The interaction (i,j) is initially given to both procs #i and #j - load is initialised at -1:
+          full_set[srcProcID][cpl] = -1;
+          full_set[tgtProcID][cpl] = -1;
         }
+    //    Compute load:
+    int procID = 0;
+    for(auto& itVector : full_set)
+      {
+        for (auto &mapIt : itVector)
+          {
+            const ProcCouple & cpl = mapIt.first;
+            if (cpl.first == cpl.second)   // interaction (i,i) : can not be removed:
+              // special case - this couple can not be removed in the future
+              mapIt.second = infinity;
+            else
+              {
+                if(cpl.first == procID)
+                  mapIt.second = (int)full_set[cpl.second].size();
+                else // cpl.second == srcProcID
+                  mapIt.second = (int)full_set[cpl.first].size();
+              }
+          }
+        procID++;
+      }
     INTERP_KERNEL::AutoPtr<bool> proc_valid = new bool[grp_size];
     fill((bool *)proc_valid, proc_valid+grp_size, true);
 
+    //
     // Now the algo:
-    while (find((bool *)proc_valid, proc_valid+grp_size, true) != proc_valid+grp_size)
+    //
+    while (find((bool *)proc_valid, proc_valid+grp_size, true) != proc_valid+grp_size) // as long as proc_valid is not full of 'false'
       {
         // Most loaded proc:
         int max_sz = -1, max_id = -1;
-        for(itVector = full_set.begin(), procID=0; itVector != full_set.end(); itVector++, procID++)
+        int procID = 0;
+        for(const auto& a_set: full_set)
           {
-            int sz = (int)(*itVector).size();
+            int sz = (int)a_set.size();
             if (proc_valid[procID] && sz > max_sz)
               {
                 max_sz = sz;
                 max_id = procID;
               }
+            procID++;
           }
 
         // Nothing more to do:
-        if (max_sz == -1)
-          break;
+        if (max_sz == -1) break;
         // For this proc, job with less loaded second proc:
         int min_sz = infinity;
         map<ProcCouple, int> & max_map = full_set[max_id];
@@ -235,20 +243,25 @@ namespace MEDCoupling
           {
           // Use a reverse iterator here increases our chances to hit a couple of the form (i, myProcId)
           // meaning that the final matrix computed won't have to be sent: save some comm.
-          map<ProcCouple, int> ::const_reverse_iterator ritMap;
-          for(ritMap=max_map.rbegin(); ritMap != max_map.rend(); ritMap++)
+          for(auto ritMap=max_map.rbegin(); ritMap != max_map.rend(); ritMap++)
             if ((*ritMap).second < min_sz)
-              hit_cpl = (*ritMap).first;
+              {
+                hit_cpl = (*ritMap).first;
+                min_sz = (*ritMap).second;
+              }
           }
         else
           {
-            for(itMap=max_map.begin(); itMap != max_map.end(); itMap++)
-              if ((*itMap).second < min_sz)
-                hit_cpl = (*itMap).first;
+            for(const auto& mapIt : max_map)
+              if (mapIt.second < min_sz)
+                {
+                  hit_cpl = mapIt.first;
+                  min_sz = mapIt.second;
+                }
           }
         if (hit_cpl.first == -1)
           {
-            // Plouf. Current proc 'max_id' can not be reduced. Invalid it:
+            // Plouf. Current proc 'max_id' can not be reduced. Invalid it and move next:
             proc_valid[max_id] = false;
             continue;
           }
@@ -262,32 +275,40 @@ namespace MEDCoupling
 
         // Now update all counts of valid maps:
         procID = 0;
-        for(itVector = full_set.begin(); itVector != full_set.end(); itVector++, procID++)
-          if(proc_valid[procID] && procID != max_id)
-            for(itMap = (*itVector).begin(); itMap != (*itVector).end(); itMap++)
-              {
-                const ProcCouple & cpl = (*itMap).first;
-                // Unremovable item:
-                if ((*itMap).second == infinity)
-                  continue;
-                if (cpl.first == max_id || cpl.second == max_id)
-                  (*itMap).second--;
-              }
+        for(auto& itVector: full_set)
+          {
+            if(proc_valid[procID] && procID != max_id)
+              for(auto& mapIt: itVector)
+                {
+                  const ProcCouple & cpl = mapIt.first;
+                  if (mapIt.second == infinity)  // Unremovable item:
+                    continue;
+                  if (cpl.first == max_id || cpl.second == max_id)
+                    mapIt.second--;
+                }
+            procID++;
+          }
       }
+
+    //
     // Final formatting - extract remaining keys in each map:
-    int myProcId=_group.myRank();
+    //
+    int myProcId = _group.myRank();
     _all_todo_lists.resize(grp_size);
     procID = 0;
-    for(itVector = full_set.begin(); itVector != full_set.end(); itVector++, procID++)
-      for(itMap = (*itVector).begin(); itMap != (*itVector).end(); itMap++)
-          _all_todo_lists[procID].push_back((*itMap).first);
+    for(const auto& itVector: full_set)
+      {
+        for(const auto& mapIt: itVector)
+          _all_todo_lists[procID].push_back(mapIt.first);
+        procID++;
+      }
     _to_do_list=_all_todo_lists[myProcId];
 
 #ifdef DEC_DEBUG
     std::stringstream scout;
     scout << "(" << myProcId << ") my TODO list is: ";
-    for (std::vector< ProcCouple >::const_iterator itdbg=_to_do_list.begin(); itdbg!=_to_do_list.end(); itdbg++)
-      scout << "(" << (*itdbg).first << "," << (*itdbg).second << ")";
+    for (const ProcCouple& pc: _to_do_list)
+      scout << "(" << pc.first << "," << pc.second << ")";
     std::cout << scout.str() << "\n";
 #endif
   }
@@ -308,22 +329,28 @@ namespace MEDCoupling
     int myProcId=_group.myRank();
     _procs_to_send_mesh.clear();
     _procs_to_send_field.clear();
-    for(int i=_group.size()-1;i>=0;i--)
+    for(int i=0;i<_group.size();i++)
       {
-        const std::vector< ProcCouple >& anRemoteProcToDoList=_all_todo_lists[i];
-        for(std::vector< ProcCouple >::const_iterator it=anRemoteProcToDoList.begin();it!=anRemoteProcToDoList.end();it++)
+        for(const ProcCouple& pc: _all_todo_lists[i])
           {
-            if((*it).first==myProcId)
+            if(pc.first==myProcId)
               {
                 if(i!=myProcId)
                   _procs_to_send_mesh.push_back(Proc_SrcOrTgt(i,true));
-                _procs_to_send_field.push_back((*it).second);
+                _procs_to_send_field.push_back(pc.second);
               }
-            if((*it).second==myProcId)
+            if(pc.second==myProcId)
               if(i!=myProcId)
                 _procs_to_send_mesh.push_back(Proc_SrcOrTgt(i,false));
           }
       }
+#ifdef DEC_DEBUG
+    std::stringstream scout;
+    scout << "(" << _group.myRank() << ") PROC TO SEND list is: ";
+    for (const auto& pc: _procs_to_send_mesh)
+      scout << "(" << pc.first << "," << (pc.second ? "src":"tgt") << ") ";
+    std::cout << scout.str() << "\n";
+#endif
   }
 
 
@@ -335,41 +362,36 @@ namespace MEDCoupling
   {
     int myProcId=_group.myRank();
     //starting to receive every procs whose id is lower than myProcId.
-    std::vector< ProcCouple > toDoListForFetchRemaining;
-    for(std::vector< ProcCouple >::const_iterator it=_to_do_list.begin();it!=_to_do_list.end();it++)
+    std::vector<ProcCouple> toDoListForFetchRemaining;
+    for (const ProcCouple& pc: _to_do_list)
       {
-        int first = (*it).first, second = (*it).second;
-        if(first!=second)
+        if(pc.first == pc.second) continue; // no xchg needed
+
+        if(pc.first==myProcId)
           {
-            if(first==myProcId)
-              {
-                if(second<myProcId)
-                  receiveRemoteMeshFrom(second,false);
-                else
-                  toDoListForFetchRemaining.push_back(ProcCouple(first,second));
-              }
+            if(pc.second<myProcId)
+              receiveRemoteMeshFrom(pc.second,false);
             else
-              {//(*it).second==myProcId
-                if(first<myProcId)
-                  receiveRemoteMeshFrom(first,true);
-                else
-                  toDoListForFetchRemaining.push_back(ProcCouple(first,second));
-              }
+              toDoListForFetchRemaining.push_back(ProcCouple(pc.first, pc.second));
+          }
+        else
+          {//pc.second==myProcId
+            if(pc.first<myProcId)
+              receiveRemoteMeshFrom(pc.first,true);
+            else
+              toDoListForFetchRemaining.push_back(ProcCouple(pc.first, pc.second));
           }
       }
     //sending source or target mesh to remote procs
-    for(std::vector< Proc_SrcOrTgt >::const_iterator it2=_procs_to_send_mesh.begin();it2!=_procs_to_send_mesh.end();it2++)
-      sendLocalMeshTo((*it2).first,(*it2).second,matrix);
+    for (const Proc_SrcOrTgt& pst: _procs_to_send_mesh)
+      sendLocalMeshTo(pst.first, pst.second,matrix);
     //fetching remaining meshes
-    for(std::vector< ProcCouple >::const_iterator it=toDoListForFetchRemaining.begin();it!=toDoListForFetchRemaining.end();it++)
-      {
-        if((*it).first!=(*it).second)
-          {
-            if((*it).first==myProcId)
-              receiveRemoteMeshFrom((*it).second,false);
-            else//(*it).second==myProcId
-              receiveRemoteMeshFrom((*it).first,true);
-          }
+    for (const ProcCouple& pc: toDoListForFetchRemaining)
+      {  // NB: here pc.first always != from pc.second
+        if(pc.first==myProcId)
+          receiveRemoteMeshFrom(pc.second,false);
+        else//pc.second==myProcId
+          receiveRemoteMeshFrom(pc.first,true);
       }
   }
   
@@ -452,6 +474,14 @@ namespace MEDCoupling
    */
   void OverlapElementLocator::sendLocalMeshTo(int procId, bool sourceOrTarget, OverlapInterpolationMatrix& matrix) const
   {
+#ifdef DEC_DEBUG
+    int rank = _group.myRank();
+    std::string st = sourceOrTarget ? "src" : "tgt";
+    std::stringstream scout;
+    scout << "(" << rank << ") SEND part of " << st << " TO: " << procId;
+    std::cout << scout.str() << "\n";
+#endif
+
    //int myProcId=_group.myRank();
    const double *distant_bb=0;
    MEDCouplingPointSet *local_mesh=0;
@@ -486,6 +516,13 @@ namespace MEDCoupling
    */
   void OverlapElementLocator::receiveRemoteMeshFrom(int procId, bool sourceOrTarget)
   {
+#ifdef DEC_DEBUG
+    int rank = _group.myRank();
+    std::string st = sourceOrTarget ? "src" : "tgt";
+    std::stringstream scout;
+    scout << "(" << rank << ") RCV part of " << st << " FROM: " << procId;
+    std::cout << scout.str() << "\n";
+#endif
     DataArrayIdType *old2new_map=0;
     MEDCouplingPointSet *m=0;
     receiveMesh(procId,m,old2new_map);
diff --git a/src/ParaMEDMEM/OverlapElementLocator.hxx b/src/ParaMEDMEM/OverlapElementLocator.hxx
index 0b3f8702e..5d44c2ff9 100644
--- a/src/ParaMEDMEM/OverlapElementLocator.hxx
+++ b/src/ParaMEDMEM/OverlapElementLocator.hxx
@@ -73,7 +73,7 @@ namespace MEDCoupling
   private:
     typedef MCAuto< MEDCouplingPointSet >  AutoMCPointSet;
     typedef MCAuto< DataArrayIdType >      AutoDAInt;
-    typedef std::pair<int,bool>  Proc_SrcOrTgt;  // a key indicating a proc ID and whether the data is for source mesh/field or target mesh/field
+    typedef std::pair<int,bool>  Proc_SrcOrTgt;  ///< a key indicating a proc ID and whether the data is for source mesh/field or target mesh/field
 
     static const int START_TAG_MESH_XCH;
 
@@ -93,14 +93,16 @@ namespace MEDCoupling
     std::vector< ProcCouple > _to_do_list;
     //! list of procs the local proc will have to send mesh data to:
     std::vector< Proc_SrcOrTgt > _procs_to_send_mesh;
-//    /*! list of procs the local proc will have to send field data to for the final matrix-vector computation:
-//     * This can be different from _procs_to_send_mesh (restricted to Source) because interpolation matrix bits are computed on a potentially
-//     * different proc than the target one.   */
+    /*! list of procs the local proc will have to send field data to for the final matrix-vector computation:
+     * This can be different from _procs_to_send_mesh (restricted to Source) because interpolation matrix bits are computed on a potentially
+     * different proc than the target one.   */
     std::vector< int > _procs_to_send_field;
     //! Set of distant meshes
     std::map< Proc_SrcOrTgt,  AutoMCPointSet > _remote_meshes;
     //! Set of cell ID mappings for the above distant meshes (because only part of the meshes are exchanged)
     std::map< Proc_SrcOrTgt, AutoDAInt > _remote_elems;
+    //! Bounding boxes (for source and target) for **all** procs.
+    //! Format minmax : Xmin_src,Xmax_src,Ymin_src,Ymax_src,Zmin_src,Zmax_src,Xmin_trg,Xmax_trg,Ymin_trg,Ymax_trg,Zmin_trg,Zmax_trg
     double* _domain_bounding_boxes;
     //! bounding box absolute adjustment
     double _epsAbs;
diff --git a/src/ParaMEDMEM/OverlapMapping.cxx b/src/ParaMEDMEM/OverlapMapping.cxx
index 293929649..203dd3ff3 100644
--- a/src/ParaMEDMEM/OverlapMapping.cxx
+++ b/src/ParaMEDMEM/OverlapMapping.cxx
@@ -544,7 +544,8 @@ void OverlapMapping::multiply(const MEDCouplingFieldDouble *fieldInput, MEDCoupl
                   throw INTERP_KERNEL::Exception("OverlapMapping::multiply(): internal error: SEND: unexpected end iterator in _sent_src_ids!");
                 vals=fieldInput->getArray()->selectByTupleId(*(*isItem11).second);
               }
-            nbsend[procID] = (int)vals->getNbOfElems();
+            nbsend[procID] = (int)vals->getNbOfElems();  // nb of elem = nb_tuples*nb_compo
+            // Flat version of values to send:
             valsToSend.insert(valsToSend.end(),vals->getConstPointer(),vals->getConstPointer()+nbsend[procID]);
           }
 
@@ -569,14 +570,14 @@ void OverlapMapping::multiply(const MEDCouplingFieldDouble *fieldInput, MEDCoupl
                 map <int,mcIdType>::const_iterator isItem11 = _nb_of_rcv_src_ids.find(procID);
                 if (isItem11 == _nb_of_rcv_src_ids.end())
                   throw INTERP_KERNEL::Exception("OverlapMapping::multiply(): internal error: RCV: unexpected end iterator in _nb_of_rcv_src_ids!");
-                nbrecv[procID] = (int)((*isItem11).second);
+                nbrecv[procID] = (int)((*isItem11).second * nbOfCompo);
               }
             else
               {
-                map<int, vector<mcIdType> >::const_iterator isItem11 = _src_ids_zip_recv.find(procID);
-                if (isItem11 == _src_ids_zip_recv.end())
+                map<int, vector<mcIdType> >::const_iterator isItem22 = _src_ids_zip_recv.find(procID);
+                if (isItem22 == _src_ids_zip_recv.end())
                   throw INTERP_KERNEL::Exception("OverlapMapping::multiply(): internal error: RCV: unexpected end iterator in _src_ids_zip_recv!");
-                nbrecv[procID] = (int)((*isItem11).second.size()*nbOfCompo);
+                nbrecv[procID] = (int)((*isItem22).second.size() * nbOfCompo);
               }
           }
     }
@@ -652,7 +653,7 @@ void OverlapMapping::multiply(const MEDCouplingFieldDouble *fieldInput, MEDCoupl
                   transform(localSrcField+((*it3).first)*nbOfCompo,
                             localSrcField+((*it3).first+1)*nbOfCompo,
                             (double *)tmp,
-                            bind2nd(multiplies<double>(),ratio) );
+                            [=](double d) { return d*ratio; });
                   // Accumulate with current value:
                   transform((double *)tmp,(double *)tmp+nbOfCompo,targetPt,targetPt,plus<double>());
                   hit_cells[j] = true;
@@ -709,7 +710,7 @@ void OverlapMapping::multiply(const MEDCouplingFieldDouble *fieldInput, MEDCoupl
                   transform(bigArr+nbrecv2[srcProcID]+((*it4).second)*nbOfCompo,
                             bigArr+nbrecv2[srcProcID]+((*it4).second+1)*nbOfCompo,
                             (double *)tmp,
-                            bind2nd(multiplies<double>(),ratio) );
+                            [=](double d) { return d*ratio; } );
                   transform((double *)tmp,(double *)tmp+nbOfCompo,targetPt,targetPt,plus<double>());
                   hit_cells[tgrIds[j]] = true;
                 }
@@ -738,7 +739,7 @@ void OverlapMapping::multiply(const MEDCouplingFieldDouble *fieldInput, MEDCoupl
                   transform(bigArr+nbrecv2[srcProcID]+((*it3).first)*nbOfCompo,
                             bigArr+nbrecv2[srcProcID]+((*it3).first+1)*nbOfCompo,
                             (double *)tmp,
-                            bind2nd(multiplies<double>(),ratio));
+                            [=](double d) { return d*ratio; } );
                   // Accumulate with current value:
                   transform((double *)tmp,(double *)tmp+nbOfCompo,targetPt,targetPt,plus<double>());
                   hit_cells[j] = true;
diff --git a/src/ParaMEDMEM_Swig/test_InterpKernelDEC.py b/src/ParaMEDMEM_Swig/test_InterpKernelDEC.py
index 0c30b6dad..494489683 100755
--- a/src/ParaMEDMEM_Swig/test_InterpKernelDEC.py
+++ b/src/ParaMEDMEM_Swig/test_InterpKernelDEC.py
@@ -255,6 +255,70 @@ class ParaMEDMEM_IK_DEC_Tests(unittest.TestCase):
         source_group.release()
         MPI.COMM_WORLD.Barrier()
 
+    def testInterpKernelDEC_2D_py_3(self):
+        """ Test on a question that often comes back: should I re-synchronize() / re-attach() each time that
+        I want to send a new field? 
+        Basic answer: 
+          - you do not have to re-synchronize, but you can re-attach a new field, as long as it has the same support.
+        WARNING: this differs in OverlapDEC ...
+        """
+        size = MPI.COMM_WORLD.size
+        rank = MPI.COMM_WORLD.rank
+        if size != 4:
+            print("Should be run on 4 procs!")
+            return
+
+        # Define two processor groups
+        nproc_source = 2
+        procs_source = list(range(nproc_source))
+        procs_target = list(range(size - nproc_source, size))
+
+        interface = CommInterface()
+        source_group = MPIProcessorGroup(interface, procs_source)
+        target_group = MPIProcessorGroup(interface, procs_target)
+        idec = InterpKernelDEC(source_group, target_group)
+
+        MPI.COMM_WORLD.Barrier()  # really necessary??
+
+        #
+        # OK, let's go DEC !!
+        #
+        mul = 1
+        for t in range(3):  # Emulating a time loop for example
+            if source_group.containsMyRank():
+                _, fieldS = self.getPartialSource(rank)
+                fieldS.setNature(IntensiveMaximum)   # The only policy supported for now ...
+                fS2 = fieldS.deepCopy()
+                fS2.setMesh(fieldS.getMesh())
+                idec.attachLocalField(fS2)         # each time, but same support!
+                if t == 0:
+                    idec.synchronize()             # only once!
+                das = fS2.getArray()
+                das *= t+1
+                idec.sendData()                    # each time!
+
+            if target_group.containsMyRank():
+                mshT, fieldT = self.getPartialTarget(rank)
+                fieldT.setNature(IntensiveMaximum)
+                fT2 = fieldT.deepCopy()
+                fT2.setMesh(fieldT.getMesh())
+                idec.attachLocalField(fT2)          # each time, but same support!
+                if t == 0:
+                    idec.synchronize()              # only once!
+                idec.recvData()                     # each time!
+                # Now the actual checks:
+                mul = t+1
+                if rank == 2:
+                    self.assertEqual(fT2.getArray().getValues(), [1.0*mul, 9.0*mul])
+                elif rank == 3:
+                    self.assertEqual(fT2.getArray().getValues(), [5.0*mul, 13.0*mul])
+
+        # Release DEC (this involves MPI exchanges -- notably the release of the communicator -- so better be done before MPI.Finalize()
+        idec.release()
+        source_group.release()
+        target_group.release()
+        MPI.COMM_WORLD.Barrier()
+
     def test_InterpKernelDEC_default(self):
         """
         [EDF27375] : Put a default value when non intersecting case
@@ -307,7 +371,7 @@ class ParaMEDMEM_IK_DEC_Tests(unittest.TestCase):
             if rank == 2:
                 # matrix S0 / T2 = [[(0,S0,1),(1,S0,1.5)]
                 # IntensiveMaximum => [[(0,S0,1/2.5),(1,S0,1.5/2.5)]
-                # 
+                #
                 self.assertTrue(field.getArray().isEqual(DataArrayDouble([0.6]),1e-12))
                 self.assertTrue( dec.retrieveNonFetchedIds().isEqual(DataArrayInt([])) )
             if rank == 3:
@@ -360,4 +424,3 @@ def Target_Proc_3():
 if __name__ == "__main__":
     unittest.main()
     MPI.Finalize()
-
diff --git a/src/ParaMEDMEM_Swig/test_OverlapDEC.py b/src/ParaMEDMEM_Swig/test_OverlapDEC.py
index 1261c21a4..4417ef3a8 100755
--- a/src/ParaMEDMEM_Swig/test_OverlapDEC.py
+++ b/src/ParaMEDMEM_Swig/test_OverlapDEC.py
@@ -37,16 +37,19 @@ class ParaMEDMEM_O_DEC_Tests(unittest.TestCase):
     Main method is testOverlapDEC_2D_py_1()
     """
 
-    def generateFullSource(self):
+    def generateFullSource(self, nb_compo=1):
         """ The complete source mesh: 4 squares each divided in 2 diagonaly (so 8 cells in total) """
         msh  = self.generateFullTarget()
         msh.simplexize(0)
         msh.setName("src_mesh")
         fld = MEDCouplingFieldDouble(ON_CELLS, ONE_TIME)
-        fld.setMesh(msh); fld.setName("source_F");
-        da = DataArrayDouble(msh.getNumberOfCells())
-        da.iota()
-        da *= 2
+        fld.setMesh(msh); fld.setName("source_F")
+        nc = msh.getNumberOfCells()
+        da = DataArrayDouble(nc*nb_compo)
+        da.rearrange(nb_compo)
+        for c in range(nb_compo):
+          da[:, c] = list(range(nc))
+        da *= 2     # To compensate for the later division of the volume by 2 betw target and source cells.
         fld.setArray(da)
         return msh, fld
 
@@ -61,11 +64,11 @@ class ParaMEDMEM_O_DEC_Tests(unittest.TestCase):
     #
     # Below, the two functions emulating the set up of a piece of the source and target mesh
     # on each proc. Obviously in real world problems, this comes from your code and is certainly
-    # not computed by cuting again from scratch the full-size mesh!!
+    # not computed by cutting again from scratch the full-size mesh!!
     #
-    def getPartialSource(self, rank):
+    def getPartialSource(self, rank, nb_compo=1):
         """ Will return an empty mesh piece for rank=2 and 3 """
-        msh, f = self.generateFullSource()
+        msh, f = self.generateFullSource(nb_compo)
         if rank in [2,3]:
             sub_m, sub_f = msh[[]], f[[]]  # Little trick to select nothing in the mesh, thus producing an empty mesh
         elif rank == 0:
@@ -75,14 +78,14 @@ class ParaMEDMEM_O_DEC_Tests(unittest.TestCase):
         sub_m.zipCoords()
         return sub_m, sub_f
 
-    def getPartialTarget(self, rank):
+    def getPartialTarget(self, rank, nb_compo=1):
         """ One square for each rank """
         msh = self.generateFullTarget()
         sub_m = msh[rank]
         sub_m.zipCoords()
         # Receiving side must prepare an empty field that will be filled by DEC:
         fld = MEDCouplingFieldDouble(ON_CELLS, ONE_TIME)
-        da = DataArrayDouble(sub_m.getNumberOfCells())
+        da = DataArrayDouble(sub_m.getNumberOfCells(), nb_compo)
         fld.setArray(da)
         fld.setName("tgt_F")
         fld.setMesh(sub_m)
@@ -109,7 +112,61 @@ class ParaMEDMEM_O_DEC_Tests(unittest.TestCase):
 
     @WriteInTmpDir
     def testOverlapDEC_2D_py_1(self):
-        """ The main method of the test """
+        """ The main method of the test. """
+        ncompo = 4   # Dummy field with 4 components
+        size = MPI.COMM_WORLD.size
+        rank = MPI.COMM_WORLD.rank
+        if size != 4:
+            raise RuntimeError("Should be run on 4 procs!")
+
+        for algo in range(3):
+            # Define (single) processor group - note the difference with InterpKernelDEC which needs two groups.
+            proc_group = list(range(size))   # No need for ProcessorGroup object here.
+            odec = OverlapDEC(proc_group)
+            odec.setWorkSharingAlgo(algo)    # Default is 1 - put here to test various different configurations of send/receive patterns
+
+            # Write out full size meshes/fields for inspection
+            if rank == 0:
+                _, fld = self.generateFullSource(ncompo)
+                mshT = self.generateFullTarget()
+                WriteField("./source_field_FULL.med", fld, True)
+                WriteUMesh("./target_mesh_FULL.med", mshT, True)
+
+            MPI.COMM_WORLD.Barrier()  # really necessary??
+
+            #
+            # OK, let's go DEC !!
+            #
+            _, fieldS = self.getPartialSource(rank, ncompo)
+            fieldS.setNature(IntensiveMaximum)   # The only policy supported for now ...
+            mshT, fieldT = self.getPartialTarget(rank, ncompo)
+            fieldT.setNature(IntensiveMaximum)
+            if rank not in [2,3]:
+                WriteField("./source_field_part_%d.med" % rank, fieldS, True)
+            WriteUMesh("./target_mesh_part_%d.med" % rank, mshT, True)
+
+            odec.attachSourceLocalField(fieldS)
+            odec.attachTargetLocalField(fieldT)
+            odec.synchronize()
+            odec.sendRecvData()
+
+            # Now the actual checks:
+            ref_vals = [1.0, 5.0, 9.0, 13.0]
+            self.assertEqual(fieldT.getArray().getValues(), [ref_vals[rank]]*ncompo)
+
+            # Release DEC (this involves MPI exchanges -- notably the release of the communicator -- so better be done before MPI.Finalize()
+            odec.release()
+
+            MPI.COMM_WORLD.Barrier()
+
+    @WriteInTmpDir
+    def testOverlapDEC_2D_py_2(self):
+        """ Test on a question that often comes back: should I re-synchronize() / re-attach() each time that
+        I want to send a new field? 
+        Basic answer: 
+          - you do not have to re-synchronize, but you can re-attach a new field, as long as it has the same support.
+        WARNING: this differs in InterpKernelDEC ...
+        """
         size = MPI.COMM_WORLD.size
         rank = MPI.COMM_WORLD.rank
         if size != 4:
@@ -119,13 +176,6 @@ class ParaMEDMEM_O_DEC_Tests(unittest.TestCase):
         proc_group = list(range(size))   # No need for ProcessorGroup object here.
         odec = OverlapDEC(proc_group)
 
-        # Write out full size meshes/fields for inspection
-        if rank == 0:
-            _, fld = self.generateFullSource()
-            mshT = self.generateFullTarget()
-            WriteField("./source_field_FULL.med", fld, True)
-            WriteUMesh("./target_mesh_FULL.med", mshT, True)
-
         MPI.COMM_WORLD.Barrier()  # really necessary??
 
         #
@@ -135,24 +185,22 @@ class ParaMEDMEM_O_DEC_Tests(unittest.TestCase):
         fieldS.setNature(IntensiveMaximum)   # The only policy supported for now ...
         mshT, fieldT = self.getPartialTarget(rank)
         fieldT.setNature(IntensiveMaximum)
-        if rank not in [2,3]:
-            WriteField("./source_field_part_%d.med" % rank, fieldS, True)
-        WriteUMesh("./target_mesh_part_%d.med" % rank, mshT, True)
-
-        odec.attachSourceLocalField(fieldS)
-        odec.attachTargetLocalField(fieldT)
-        odec.synchronize()
-        odec.sendRecvData()
-
-        # Now the actual checks:
-        if rank == 0:
-            self.assertEqual(fieldT.getArray().getValues(), [1.0])
-        elif rank == 1:
-            self.assertEqual(fieldT.getArray().getValues(), [5.0])
-        elif rank == 2:
-            self.assertEqual(fieldT.getArray().getValues(), [9.0])
-        elif rank == 3:
-            self.assertEqual(fieldT.getArray().getValues(), [13.0])
+
+        mul = 1
+        for t in range(3):  # Emulating a time loop ...
+            if t == 0:
+                odec.attachSourceLocalField(fieldS)   # only once!
+                odec.attachTargetLocalField(fieldT)   # only once!
+                odec.synchronize()                    # only once!
+            else:
+                das = fieldS.getArray()               # but we can still hack the underlying field values ...
+                das *= 2
+                mul *= 2
+            odec.sendRecvData()                       # each time!
+
+            # Now the actual checks:
+            ref_vals = [1.0, 5.0, 9.0, 13.0]
+            self.assertEqual(fieldT.getArray().getValues(), [ref_vals[rank]*mul])
 
         # Release DEC (this involves MPI exchanges -- notably the release of the communicator -- so better be done before MPI.Finalize()
         odec.release()
@@ -162,3 +210,5 @@ class ParaMEDMEM_O_DEC_Tests(unittest.TestCase):
 if __name__ == "__main__":
     unittest.main()
     MPI.Finalize()
+    # tt = ParaMEDMEM_O_DEC_Tests()
+    # tt.testOverlapDEC_2D_py_1()
-- 
2.39.2