]> SALOME platform Git repositories - modules/kernel.git/commitdiff
Salome HOME
[EDF30062] [EDF29150]: Additional fault tolerant mechanism
authorAnthony Geay <anthony.geay@edf.fr>
Fri, 31 May 2024 16:32:46 +0000 (18:32 +0200)
committerAnthony Geay <anthony.geay@edf.fr>
Wed, 5 Jun 2024 07:50:42 +0000 (09:50 +0200)
19 files changed:
idl/SALOME_Component.idl
idl/SALOME_ContainerManager.idl
src/Basics/KernelBasis.cxx
src/Basics/KernelBasis.hxx
src/Basics/KernelBasis.i
src/Container/CMakeLists.txt
src/Container/Container_i.cxx
src/Container/SALOME_Container.py
src/Container/SALOME_ContainerManager.cxx
src/Container/SALOME_ContainerManager.hxx
src/Container/SALOME_Container_No_NS_Serv.cxx
src/Container/SALOME_Container_No_NS_Serv_Generic.hxx
src/Container/SALOME_Container_No_NS_Serv_OutProcess.cxx
src/Container/SALOME_Container_No_NS_Serv_OutProcess_FT.cxx [new file with mode: 0644]
src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay.cxx
src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay_FT.cxx [new file with mode: 0644]
src/Container/SALOME_Container_i.hxx
src/Container/SALOME_PyNode.py
src/Launcher/Test/testCrashProofContainer.py

index 04ac5b3405307896fb21b49a67517b63bb95a22b..2d7d3ea5bd4971e20891685b1c5c2bee4c96b39a 100644 (file)
@@ -89,6 +89,8 @@ module Engines
     void set_big_obj_on_disk_threshold(in long thresholdInByte);
 
     void set_big_obj_on_disk_directory(in string directory);
+    
+    void set_number_of_retry(in long nbRetry);
 
     void addLogFileNameGroup(in vectorOfString groupOfLogFileNames);
     
index 32d9c6676ad412ad67bfd49d09a0f7c94710dd33..110c5b41988e8dc0d7b6fb32cdc0282434d711ff 100644 (file)
@@ -112,6 +112,10 @@ interface ContainerManager
 
   void SetBigObjOnDiskDirectory(in string directory);
 
+  void SetNumberOfRetry(in long nbRetry);
+
+  long GetNumberOfRetry();
+
   void SetCodeOnContainerStartUp(in string code);
 
   string GetCodeOnContainerStartUp();
index 649f51556c4ea488e9497bfe8113c5580f121057..c236f00206bd1da8866d2dce09878906b761435d 100644 (file)
@@ -81,6 +81,10 @@ namespace SALOME
   static constexpr char OUT_OF_PROCESS_NO_REPLAY_VALUE_STR[] = "OutOfProcessNoReplay";
   static constexpr char OUT_OF_PROCESS_WITH_REPLAY_VALUE = 2;
   static constexpr char OUT_OF_PROCESS_WITH_REPLAY_VALUE_STR[] = "OutOfProcessWithReplay";
+  static constexpr char OUT_OF_PROCESS_NO_REPLAY_FT_VALUE = 3;
+  static constexpr char OUT_OF_PROCESS_NO_REPLAY_FT_VALUE_STR[] = "OutOfProcessNoReplayFT";
+  static constexpr char OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE = 4;
+  static constexpr char OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE_STR[] = "OutOfProcessWithReplayFT";
 
   static PyExecutionMode FromIntToPyExecutionMode(char value)
   {
@@ -92,6 +96,10 @@ namespace SALOME
         return PyExecutionMode::OutOfProcessNoReplay;
       case OUT_OF_PROCESS_WITH_REPLAY_VALUE:
         return PyExecutionMode::OutOfProcessWithReplay;
+      case OUT_OF_PROCESS_NO_REPLAY_FT_VALUE:
+        return PyExecutionMode::OutOfProcessNoReplayFT;
+      case OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE:
+        return PyExecutionMode::OutOfProcessWithReplayFT;
     }
     throw std::range_error("FromIntToPyExecutionMode : Invalid value for Py Execution Mode ! Must be in 0 (InProcess), 1 (OutOfProcessNoReplay) or 2 (OutOfProcessWithReplay) !");
   }
@@ -104,6 +112,10 @@ namespace SALOME
       return PyExecutionMode::OutOfProcessNoReplay;
     if(value == OUT_OF_PROCESS_WITH_REPLAY_VALUE_STR)
       return PyExecutionMode::OutOfProcessWithReplay;
+    if(value == OUT_OF_PROCESS_NO_REPLAY_FT_VALUE_STR)
+      return PyExecutionMode::OutOfProcessNoReplayFT;
+    if(value == OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE_STR)
+      return PyExecutionMode::OutOfProcessWithReplayFT;
     throw std::range_error("FromStrToPyExecutionMode : Invalid str value for py execution mode !");
   }
 
@@ -117,6 +129,10 @@ namespace SALOME
         return OUT_OF_PROCESS_NO_REPLAY_VALUE_STR;
       case PyExecutionMode::OutOfProcessWithReplay:
         return OUT_OF_PROCESS_WITH_REPLAY_VALUE_STR;
+      case PyExecutionMode::OutOfProcessNoReplayFT:
+        return OUT_OF_PROCESS_NO_REPLAY_FT_VALUE_STR;
+      case PyExecutionMode::OutOfProcessWithReplayFT:
+        return OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE_STR;
       default:
         throw std::range_error("FromExecutionModeToStr : Invalid str value for py execution mode !");
     }
@@ -139,6 +155,10 @@ void SALOME::SetBigObjOnDiskThreshold(int newThresholdInByte)
 
 static std::string SALOME_FILE_BIG_OBJ_DIR;
 
+constexpr int DFT_SALOME_NB_RETRY = 1;
+
+static int SALOME_NB_RETRY = DFT_SALOME_NB_RETRY;
+
 std::string SALOME::GetBigObjOnDiskDirectory()
 {
   return SALOME_FILE_BIG_OBJ_DIR;
@@ -154,6 +174,16 @@ bool SALOME::BigObjOnDiskDirectoryDefined()
   return ! SALOME_FILE_BIG_OBJ_DIR.empty();
 }
 
+/*!
+ * Set the number of times an out of process python script is launched before giving up.
+ *
+ * \param [in] nbRetry requested number of attempts. Values lower than 1 are replaced by 1.
+ *
+ * The stored value is the one returned by SALOME::GetNumberOfRetry. The python launcher iterates
+ * over range( GetNumberOfRetry() ) : with a value lower than 1 the subprocess would never be started
+ * and no return code would be available.
+ */
+void SALOME::SetNumberOfRetry(int nbRetry)
+{
+  // at least one attempt is mandatory, see comment above
+  SALOME_NB_RETRY = nbRetry < 1 ? 1 : nbRetry;
+}
+
+int SALOME::GetNumberOfRetry()
+{
+  return SALOME_NB_RETRY;
+}
+
 static SALOME::PyExecutionMode DefaultPyExecMode = SALOME::PyExecutionMode::NotSet;
 
 void SALOME::SetPyExecutionMode(PyExecutionMode mode)
index cbecfe72e1940b7e4329e87c913ac70bd3b2fb86..d2a7bb222030a0fc634afe81cc95c05df27e7f17 100644 (file)
@@ -37,7 +37,7 @@ void BASICS_EXPORT WriteInStderr(const std::string& msg);
 
 namespace SALOME
 {
-  enum class PyExecutionMode { NotSet, InProcess, OutOfProcessNoReplay, OutOfProcessWithReplay };
+  enum class PyExecutionMode { NotSet, InProcess, OutOfProcessNoReplay, OutOfProcessWithReplay, OutOfProcessNoReplayFT, OutOfProcessWithReplayFT };
   void BASICS_EXPORT SetPyExecutionMode(PyExecutionMode mode);
   void BASICS_EXPORT SetPyExecutionModeStr(const std::string& mode);
   std::vector<std::string> BASICS_EXPORT GetAllPyExecutionModes();
@@ -48,4 +48,6 @@ namespace SALOME
   std::string BASICS_EXPORT GetBigObjOnDiskDirectory();
   void BASICS_EXPORT SetBigObjOnDiskDirectory(const std::string& directory);
   bool BASICS_EXPORT BigObjOnDiskDirectoryDefined();
+  void BASICS_EXPORT SetNumberOfRetry(int nbRetry);
+  int BASICS_EXPORT GetNumberOfRetry();
 }
index 092c7537ff72bcd58b0ab9c7298658fa7ada0aa2..6c5e8536f6e805312a21cd9e217d0f28c90665dd 100644 (file)
@@ -56,6 +56,8 @@ using namespace SALOME;
 %rename (GetBigObjOnDiskDirectory) GetBigObjOnDiskDirectorySwig;
 %rename (SetBigObjOnDiskDirectory) SetBigObjOnDiskDirectorySwig;
 %rename (BigObjOnDiskDirectoryDefined) BigObjOnDiskDirectoryDefinedSwig;
+%rename (SetNumberOfRetry) SetNumberOfRetrySwig;
+%rename (GetNumberOfRetry) GetNumberOfRetrySwig;
 
 bool getSSLMode();
 void setSSLMode(bool sslMode);
@@ -142,6 +144,16 @@ bool BigObjOnDiskDirectoryDefinedSwig()
   return SALOME::BigObjOnDiskDirectoryDefined();
 }
 
+void SetNumberOfRetrySwig(int nbRetry)
+{
+  SALOME::SetNumberOfRetry( nbRetry );
+}
+
+int GetNumberOfRetrySwig()
+{
+  return SALOME::GetNumberOfRetry( );
+}
+
 void SetVerbosityLevelSwig(const std::string& level)
 {
   SetVerbosityLevelStr(level);
index 4eb507ecca731a536b265ff07cc033e0c298029f..e61ebbb214b8e43b66372bf7fd77df45086d34ba 100644 (file)
@@ -119,11 +119,17 @@ TARGET_LINK_LIBRARIES(SALOME_Container_No_NS_Serv_OutProcess SalomeContainerServ
 ADD_EXECUTABLE(SALOME_Container_No_NS_Serv_OutProcess_Replay SALOME_Container_No_NS_Serv_OutProcess_Replay.cxx)
 TARGET_LINK_LIBRARIES(SALOME_Container_No_NS_Serv_OutProcess_Replay SalomeContainerServer)
 
+ADD_EXECUTABLE(SALOME_Container_No_NS_Serv_OutProcess_FT SALOME_Container_No_NS_Serv_OutProcess_FT.cxx)
+TARGET_LINK_LIBRARIES(SALOME_Container_No_NS_Serv_OutProcess_FT SalomeContainerServer)
+
+ADD_EXECUTABLE(SALOME_Container_No_NS_Serv_OutProcess_Replay_FT SALOME_Container_No_NS_Serv_OutProcess_Replay_FT.cxx)
+TARGET_LINK_LIBRARIES(SALOME_Container_No_NS_Serv_OutProcess_Replay_FT SalomeContainerServer)
+
 IF(SALOME_BUILD_TESTS)
   ADD_EXECUTABLE(TestSalome_file TestSalome_file.cxx)
   TARGET_LINK_LIBRARIES(TestSalome_file SALOMETraceCollectorTest ${SALOME_Container_LIBS})
 ENDIF()
-INSTALL(TARGETS SALOME_Container SALOME_Container_No_NS_Serv SALOME_Container_No_NS_Serv_OutProcess SALOME_Container_No_NS_Serv_OutProcess_Replay DESTINATION ${SALOME_INSTALL_BINS})
+INSTALL(TARGETS SALOME_Container SALOME_Container_No_NS_Serv SALOME_Container_No_NS_Serv_OutProcess SALOME_Container_No_NS_Serv_OutProcess_Replay SALOME_Container_No_NS_Serv_OutProcess_FT SALOME_Container_No_NS_Serv_OutProcess_Replay_FT DESTINATION ${SALOME_INSTALL_BINS})
 
 # Executable scripts to be installed
 SALOME_INSTALL_SCRIPTS("${SCRIPTS}" ${SALOME_INSTALL_SCRIPT_PYTHON})
index a12edd4ff553546ffd4532e47d0f6a4bf43ed9aa..4e0d5fe93fd0bd565887c1934bec78595da94a12 100644 (file)
@@ -1180,6 +1180,11 @@ void Abstract_Engines_Container_i::set_big_obj_on_disk_directory(const char *dir
   SALOME::SetBigObjOnDiskDirectory(directory);
 }
 
+void Abstract_Engines_Container_i::set_number_of_retry(CORBA::Long nbRetry)
+{
+  SALOME::SetNumberOfRetry( nbRetry );
+}
+
 Engines::vectorOfString_var FromVecStringCppToCORBA( const std::vector<std::string>& group)
 {
   Engines::vectorOfString_var ret( new Engines::vectorOfString );
index bd267d4dc92d570538d01735be4e960ba10cf386..a7d2f83c58e483c98aa9e41399f4b1da022cfa54 100644 (file)
@@ -223,3 +223,17 @@ class SALOME_Container_OutOfProcess_Replay_i(SALOME_Container_i):
 
     def getPyScriptCls(self):
       return SALOME_PyNode.PyScriptNode_OutOfProcess_Replay_i
+
+class SALOME_Container_OutOfProcess_FT_i(SALOME_Container_i):
+    def __init__(self, containerName, containerIORStr, dftTimeIntervalInMs):
+      super().__init__(containerName, containerIORStr, dftTimeIntervalInMs)
+      
+    def getPyScriptCls(self):
+      return SALOME_PyNode.PyScriptNode_OutOfProcess_FT_i
+
+class SALOME_Container_OutOfProcess_Replay_FT_i(SALOME_Container_i):
+    def __init__(self, containerName, containerIORStr, dftTimeIntervalInMs):
+      super().__init__(containerName, containerIORStr, dftTimeIntervalInMs)
+
+    def getPyScriptCls(self):
+      return SALOME_PyNode.PyScriptNode_OutOfProcess_Replay_FT_i
index be55b1bf36d3108a990991905938c5b1e320a419..2ee5639bce06ee691dc02b32a942cfa40fb8ed75 100644 (file)
@@ -246,6 +246,16 @@ void SALOME_ContainerManager::SetBigObjOnDiskDirectory(const char *directory)
   SALOME::SetBigObjOnDiskDirectory(directory);
 }
 
+ void SALOME_ContainerManager::SetNumberOfRetry(CORBA::Long nbRetry)
+ {
+    SALOME::SetNumberOfRetry( nbRetry );
+ }
+
+CORBA::Long SALOME_ContainerManager::GetNumberOfRetry()
+{
+  return SALOME::GetNumberOfRetry();
+}
+
 //=============================================================================
 //! Loop on all the containers listed in naming service, ask shutdown on each
 /*! CORBA Method:
@@ -541,6 +551,7 @@ Engines::Container_ptr SALOME_ContainerManager::GiveContainer(const Engines::Con
         INFOS("[GiveContainer] container " << containerNameInNS << " override " << envInfo.str());
         cont->set_big_obj_on_disk_directory( SALOME::GetBigObjOnDiskDirectory().c_str() );
         cont->set_big_obj_on_disk_threshold( SALOME::GetBigObjOnDiskThreshold() );
+        cont->set_number_of_retry( SALOME::GetNumberOfRetry() );
         Engines::FieldsDict envCorba;
         {
           auto sz = _override_env.size();
@@ -586,6 +597,10 @@ std::string SALOME_ContainerManager::GetCppBinaryOfKernelSSLContainer() const
       return "SALOME_Container_No_NS_Serv_OutProcess";
     case SALOME::PyExecutionMode::OutOfProcessWithReplay:
       return "SALOME_Container_No_NS_Serv_OutProcess_Replay";
+    case SALOME::PyExecutionMode::OutOfProcessNoReplayFT:
+      return "SALOME_Container_No_NS_Serv_OutProcess_FT";
+    case SALOME::PyExecutionMode::OutOfProcessWithReplayFT:
+      return "SALOME_Container_No_NS_Serv_OutProcess_Replay_FT";
     default:
       {
         ERROR_MESSAGE("Not manager py execution mode");
index 6119ec393ac3299f073a9b2211a11ea8b7d4ace8..a02bd92587fb00eb25be733ab2182662e43d331f 100644 (file)
@@ -83,6 +83,10 @@ public:
 
   void SetBigObjOnDiskDirectory(const char *directory) override;
 
+  void SetNumberOfRetry(CORBA::Long nbRetry) override;
+
+  CORBA::Long GetNumberOfRetry() override;
+
   static const char *_ContainerManagerNameInNS;
 
 private:
index f7df00250ab731d6820e0ceb2133804f4237f62c..097663605066b003b3a48a7cf274bfcba23da52e 100644 (file)
@@ -19,4 +19,7 @@
 
 #include "SALOME_Container_No_NS_Serv_Generic.hxx"
 
-GENERIC_CONTAINER_EXECUTABLE( Engines_Container_SSL_i )
+int main(int argc, char* argv[])
+{
+  return GenericContainerExecutable<Engines_Container_SSL_i>(argc,argv);
+}
index d5bc1e7f62116f578eecd7fb85a8c2c487d4c1d9..bd4932a2f4bf29d279d29271c797806e7363f537 100644 (file)
 #include "SALOME_KernelORB.hxx"
 #include "KernelBasis.hxx"
 
-#define GENERIC_CONTAINER_EXECUTABLE( cls )                                                                                                              \
-int main(int argc, char* argv[])                                                                                                                         \
-{                                                                                                                                                        \
-  if(argc<3)                                                                                                                                             \
-    THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : requires 2 input arguments <containerName> <IOR of Engines::EmbeddedNamingService>" );        \
-  CORBA::ORB_ptr orb(KERNEL::getORB());                                                                                                                  \
-  std::string IOROfEmbeddedNamingService(argv[2]);                                                                                                       \
-  setIOROfEmbeddedNS(IOROfEmbeddedNamingService);                                                                                                        \
-  CORBA::Object_var ns_serv_obj_base = orb->string_to_object(IOROfEmbeddedNamingService.c_str());                                                        \
-  if( CORBA::is_nil(ns_serv_obj_base) )                                                                                                                  \
-    THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : argument 2 is NOT a valid IOR" );                                                             \
-  Engines::EmbeddedNamingService_var ns_serv_obj = Engines::EmbeddedNamingService::_narrow(ns_serv_obj_base);                                            \
-  if( CORBA::is_nil(ns_serv_obj) )                                                                                                                       \
-    THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : argument 2 is NOT a valid IOR of Engines::EmbeddedNamingService" );                           \
-  std::unique_ptr<SALOME_NamingService_Container_Abstract> ns( new SALOME_Embedded_NamingService_Client(ns_serv_obj) );                                  \
-  return container_common_main<cls>(argc,argv,std::move(ns));                                                                                            \
+template<class CLS>
+int GenericContainerExecutable(int argc, char* argv[])                                                                                                                 
+{                                                                                                                                                
+  if(argc<3)
+    THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : requires 2 input arguments <containerName> <IOR of Engines::EmbeddedNamingService>" );
+  CORBA::ORB_ptr orb(KERNEL::getORB());
+  std::string IOROfEmbeddedNamingService(argv[2]);
+  setIOROfEmbeddedNS(IOROfEmbeddedNamingService);
+  CORBA::Object_var ns_serv_obj_base = orb->string_to_object(IOROfEmbeddedNamingService.c_str());
+  if( CORBA::is_nil(ns_serv_obj_base) )
+    THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : argument 2 is NOT a valid IOR" );
+  Engines::EmbeddedNamingService_var ns_serv_obj = Engines::EmbeddedNamingService::_narrow(ns_serv_obj_base);
+  if( CORBA::is_nil(ns_serv_obj) )
+    THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : argument 2 is NOT a valid IOR of Engines::EmbeddedNamingService" );
+  std::unique_ptr<SALOME_NamingService_Container_Abstract> ns( new SALOME_Embedded_NamingService_Client(ns_serv_obj) );                          
+  return container_common_main<CLS>(argc,argv,std::move(ns));
 }
index f535d073a7a2a52579d59e25cc05fa169d49a8e8..c1c02c784e2928c25fa2c8b3732b6de3d728badf 100644 (file)
@@ -19,4 +19,7 @@
 
 #include "SALOME_Container_No_NS_Serv_Generic.hxx"
 
-GENERIC_CONTAINER_EXECUTABLE( Engines_Container_SSL_OutOfProcess_i )
+int main(int argc, char* argv[])
+{
+  return GenericContainerExecutable<Engines_Container_SSL_OutOfProcess_i>(argc,argv);
+}
diff --git a/src/Container/SALOME_Container_No_NS_Serv_OutProcess_FT.cxx b/src/Container/SALOME_Container_No_NS_Serv_OutProcess_FT.cxx
new file mode 100644 (file)
index 0000000..fdc3d96
--- /dev/null
@@ -0,0 +1,25 @@
+// Copyright (C) 2021-2024  CEA, EDF
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+//
+// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
+//
+
+#include "SALOME_Container_No_NS_Serv_Generic.hxx"
+
+int main(int argc, char* argv[])
+{
+  return GenericContainerExecutable<Engines_Container_SSL_OutOfProcess_FT_i>(argc,argv);
+}
index 27a90f58ed3227f06416549034927a9ed7b90d31..be198b252a9e2db4f3da5cf1a7a868e7168d225a 100644 (file)
@@ -19,4 +19,7 @@
 
 #include "SALOME_Container_No_NS_Serv_Generic.hxx"
 
-GENERIC_CONTAINER_EXECUTABLE( Engines_Container_SSL_OutOfProcess_Replay_i )
+int main(int argc, char* argv[])
+{
+  return GenericContainerExecutable<Engines_Container_SSL_OutOfProcess_Replay_i>(argc,argv);
+}
diff --git a/src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay_FT.cxx b/src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay_FT.cxx
new file mode 100644 (file)
index 0000000..7568dea
--- /dev/null
@@ -0,0 +1,25 @@
+// Copyright (C) 2021-2024  CEA, EDF
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+//
+// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
+//
+
+#include "SALOME_Container_No_NS_Serv_Generic.hxx"
+
+int main(int argc, char* argv[])
+{
+  return GenericContainerExecutable<Engines_Container_SSL_OutOfProcess_Replay_FT_i>(argc,argv);
+}
index d929939f09b793951d8e4738951b4d22c4b337db..08c174254c09cff57123be2289ca9f97d7fc380f 100644 (file)
@@ -86,6 +86,8 @@ public:
 
   void set_big_obj_on_disk_directory(const char *directory) override;
 
+  void set_number_of_retry(CORBA::Long nbRetry) override;
+
   void addLogFileNameGroup(const Engines::vectorOfString& groupOfLogFileNames) override;
     
   Engines::vectorOfVectorOfString *getAllLogFileNameGroups() override;
@@ -223,7 +225,8 @@ protected:
 constexpr char PY_CONTAINER_CLS_NAME_IN_PROCESS[] = "SALOME_Container_i";
 constexpr char PY_CONTAINER_CLS_NAME_OUT_PROCESS_NO_REPLAY[] = "SALOME_Container_OutOfProcess_i";
 constexpr char PY_CONTAINER_CLS_NAME_OUT_PROCESS_WITH_REPLAY[] = "SALOME_Container_OutOfProcess_Replay_i";
-
+constexpr char PY_CONTAINER_CLS_NAME_OUT_PROCESS_NO_REPLAY_FT[] = "SALOME_Container_OutOfProcess_FT_i";
+constexpr char PY_CONTAINER_CLS_NAME_OUT_PROCESS_WITH_REPLAY_FT[] = "SALOME_Container_OutOfProcess_Replay_FT_i";
 
 class CONTAINER_EXPORT Engines_Container_i : public Abstract_Engines_Container_i
 {
@@ -289,6 +292,30 @@ public:
                           Abstract_Engines_Container_SSL_i(PY_CONTAINER_CLS_NAME_OUT_PROCESS_WITH_REPLAY, orb, poa, containerName, argc, argv, ns, isServantAloneInProcess) {}
 };
 
+class CONTAINER_EXPORT Engines_Container_SSL_OutOfProcess_FT_i : public Abstract_Engines_Container_SSL_i
+{
+public:
+  Engines_Container_SSL_OutOfProcess_FT_i(CORBA::ORB_ptr orb,
+                          PortableServer::POA_ptr poa,
+                          char *containerName,
+                          int argc, char *argv[],
+                          SALOME_NamingService_Container_Abstract *ns = nullptr,
+                          bool isServantAloneInProcess = true) :
+                          Abstract_Engines_Container_SSL_i(PY_CONTAINER_CLS_NAME_OUT_PROCESS_NO_REPLAY_FT, orb, poa, containerName, argc, argv, ns, isServantAloneInProcess) {}
+};
+
+class CONTAINER_EXPORT Engines_Container_SSL_OutOfProcess_Replay_FT_i : public Abstract_Engines_Container_SSL_i
+{
+public:
+  Engines_Container_SSL_OutOfProcess_Replay_FT_i(CORBA::ORB_ptr orb,
+                          PortableServer::POA_ptr poa,
+                          char *containerName,
+                          int argc, char *argv[],
+                          SALOME_NamingService_Container_Abstract *ns = nullptr,
+                          bool isServantAloneInProcess = true) :
+                          Abstract_Engines_Container_SSL_i(PY_CONTAINER_CLS_NAME_OUT_PROCESS_WITH_REPLAY_FT, orb, poa, containerName, argc, argv, ns, isServantAloneInProcess) {}
+};
+
 /*!
  * Methods to be used in SSL mode to skip NS.
  */
index 312c7bf5d94cd29f8273b7a491e2fb79d317816a..0b0c9b3cd66c07b02c19efa156cc3e8c481a715f 100644 (file)
@@ -40,6 +40,8 @@ MY_CONTAINER_ENTRY_IN_GLBS = "my_container"
 
 MY_PERFORMANCE_LOG_ENTRY_IN_GLBS = "my_log_4_this_session"
 
+MY_KEY_TO_DETECT_FINISH = "neib av tuot"
+
 class Generic(SALOME__POA.GenericObj):
   """A Python implementation of the GenericObj CORBA IDL"""
   def __init__(self,poa):
@@ -560,6 +562,9 @@ class GenericPythonMonitoringLauncherCtxMgr:
     
     def __exit__(self,exctype, exc, tb):
         StopMonitoring( self._monitoring_params )
+        del self._monitoring_params
+        import gc
+        gc.collect() # force destruction of objects even in raise context
 
 def StopMonitoring( monitoringInfo ):
   """
@@ -730,6 +735,9 @@ with open(inputFileName,"rb") as f:
 context[MY_PERFORMANCE_LOG_ENTRY_IN_GLBS] = eval( MY_PERFORMANCE_LOG_ENTRY_IN_GLBS )
 with open(codeFileName,"r") as f:
   code = f.read()
+#
+import gc
+gc.disable()
 # go for execution
 exec( code , context )
 # filter part of context to be exported to father process
@@ -792,7 +800,7 @@ Looks like a hard crash as returnCode {returnCode} != 0
 {banner}
 """
 
-def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, keepFilesToReplay ):
+def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, keepFilesToReplay, closeEyesOnErrorAtExit):
   """
   Equivalent of exec(code,context) but executed in a separate subprocess to avoid to make the current process crash.
   
@@ -805,6 +813,7 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL
   containerRef (Engines.Container) : Container ref (retrieving the Files to created when keepFilesToReplay is set to False)
   instanceOfLogOfCurrentSession (LogOfCurrentExecutionSession) : instance of LogOfCurrentExecutionSession to build remotely the reference in order to log information
   keepFilesToReplay (bool) : if True when something goes wrong during execution all the files to replay post mortem case are kept. If False only error is reported but files to replay are destoyed.
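+  closeEyesOnErrorAtExit (bool) : if True, a non zero return code of the subprocess is still considered as a success provided that MY_KEY_TO_DETECT_FINISH is found at the end of stderr, i.e. the code has been executed up to its end before the failure.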
 
   Return:
   -------
@@ -820,8 +829,22 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL
   import pickle
   import subprocess as sp
   import CORBA
+  #
+  def IsConsideredAsOKRun( returnCode, closeEyesOnErrorAtExit , stderr ):
+    """
+    Tell if the run of the subprocess has to be considered as successful and remove the end-of-code marker from stderr.
+
+    Args:
+    -----
+
+    returnCode (int) : return code of the subprocess
+    closeEyesOnErrorAtExit (bool) : if True, a non zero return code is accepted provided that stderr ends with MY_KEY_TO_DETECT_FINISH
+    stderr (str) : decoded standard error of the subprocess
+
+    Return:
+    -------
+
+    tuple (bool, str) : success status and stderr without the trailing MY_KEY_TO_DETECT_FINISH marker, if any
+    """
+    # The marker is stripped only when it is really the tail of stderr. Slicing blindly would eat the last
+    # characters of a genuine error message when the subprocess died before reaching the end of the code.
+    markerFound = closeEyesOnErrorAtExit and stderr.endswith( MY_KEY_TO_DETECT_FINISH )
+    stderrToReport = stderr[:-len(MY_KEY_TO_DETECT_FINISH)] if markerFound else stderr
+    return returnCode == 0 or markerFound, stderrToReport
+
   #
   def InternalExecResistant( code, context, outargsname):
+    import KernelBasis
     orb = CORBA.ORB_init([''])
     iorScriptLog = orb.object_to_string( instanceOfLogOfCurrentSession._remote_handle )#ref ContainerScriptPerfLog_ptr
     ####
@@ -830,6 +853,11 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL
       return os.path.splitext( os.path.basename(fname)[len(EXEC_CODE_FNAME_PXF):] )[0]
     with tempfile.NamedTemporaryFile(dir=os.getcwd(),prefix=EXEC_CODE_FNAME_PXF,suffix=".py", mode="w", delete = False) as codeFd:
       codeFd.write( code )
+      if closeEyesOnErrorAtExit:
+        codeFd.write( """
+import sys
+sys.stderr.write({!r})
+sys.stderr.flush()""".format( MY_KEY_TO_DETECT_FINISH ) )
       codeFd.flush()
       codeFileName = os.path.basename( codeFd.name )
       contextFileName = "contextsafe_{}.pckl".format( RetrieveUniquePartFromPfx( codeFileName  ) )
@@ -839,24 +867,32 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL
       mainExecFileName = os.path.abspath( "mainexecsafe_{}.py".format( RetrieveUniquePartFromPfx( codeFileName  ) ) )
       with open(mainExecFileName,"w") as f:
         f.write( FinalCode.format( codeFileName, contextFileName, resFileName, outargsname, iorScriptLog ) )
-      p = sp.Popen(["python3", mainExecFileName],stdout = sp.PIPE, stderr = sp.PIPE)
-      stdout, stderr = p.communicate()
-      returnCode = p.returncode
+      for iTry in range( KernelBasis.GetNumberOfRetry() ):
+        if iTry > 0:
+          print( "WARNING : Retry # {}. Following code has generated non zero return code ( {} ). Trying again ... \n{}".format( iTry, returnCode, code ) )
+        p = sp.Popen(["python3", mainExecFileName],stdout = sp.PIPE, stderr = sp.PIPE)
+        stdout, stderr = p.communicate()
+        returnCode = p.returncode
+        if returnCode == 0:
+          break
     return returnCode, stdout, stderr, PythonFunctionEvaluatorParams(mainExecFileName,codeFileName,contextFileName,resFileName)
   ret = instanceOfLogOfCurrentSession._current_instance
   returnCode, stdout, stderr, evParams = InternalExecResistant( code, context, outargsname )
   stdout = stdout.decode()
   stderr = stderr.decode()
   sys.stdout.write( stdout ) ; sys.stdout.flush()
+  isOK, stderr = IsConsideredAsOKRun( returnCode, closeEyesOnErrorAtExit , stderr )
   sys.stderr.write( stderr ) ; sys.stderr.flush()
-  if returnCode == 0:
+  if isOK:
     pcklData = instanceOfLogOfCurrentSession._remote_handle.getObj()
     if len(pcklData) > 0:
       ret = pickle.loads( pcklData )
     context.update( evParams.result )
     evParams.destroyOnOK()
+    if returnCode != 0:
+      print( "WARNING : Following code has generated non zero return code ( {} ) but considered as OK\n{}".format( returnCode, code ) )
     return ret
-  if returnCode != 0:
+  else:
     if keepFilesToReplay:
       evParams.destroyOnKO( containerRef )
     else:
@@ -864,10 +900,16 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL
     raise RuntimeError(f"Subprocess launched {evParams.strDependingOnReturnCode(keepFilesToReplay,returnCode)}stdout :\n{stdout}\nstderr :\n{stderr}")
 
 def ExecCrashProofWithReplay( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ):
-  return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, True)
+  return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, True, False)
 
 def ExecCrashProofWithoutReplay( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ):
-  return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, False)
+  return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, False, False)
+
+def ExecCrashProofWithReplayFT( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ):
+  return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, True, True)
+
+def ExecCrashProofWithoutReplayFT( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ):
+  return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, False, True)
 
 def ExecLocal( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ):
   exec( code, context )
@@ -894,7 +936,13 @@ class LogOfCurrentExecutionSession(LogOfCurrentExecutionSessionAbs):
     self.finalizeAndPushToMaster()
 
   def finalizeAndPushToMaster(self):
-    self._remote_handle.assign( pickle.dumps( self._current_instance ) )
+    """
+    Voluntary do nothing in case of problem to avoid to trouble execution
+    """
+    try:
+      self._remote_handle.assign( pickle.dumps( self._current_instance ) )
+    except:
+      pass
 
 class LogOfCurrentExecutionSessionStub(LogOfCurrentExecutionSessionAbs):
   """
@@ -1019,17 +1067,20 @@ class PyScriptNode_Abstract_i(Engines__POA.PyScriptNode,Generic,abc.ABC):
 
   def executeSecond(self,outargsname):
     """ Same than second part of self.execute to reduce memory peak."""
+    def executeSecondInternal(monitoringtimeresms):
+      with GenericPythonMonitoringLauncherCtxMgr( CPUMemoryMonitoring( monitoringtimeresms ) ) as monitoringParams:
+        currentInstance = self.executeNow( outargsname )
+        cpumeminfo = ReadCPUMemInfo( monitoringParams )
+      return cpumeminfo, currentInstance
+
     import sys
     try:
       self.addTimeInfoOnLevel2("startExecTime")
       ##
       self.addInfoOnLevel2("measureTimeResolution",self.my_container_py.monitoringtimeresms())
-      with GenericPythonMonitoringLauncherCtxMgr( CPUMemoryMonitoring( self.my_container_py.monitoringtimeresms() ) ) as monitoringParams:
-        self._current_execution_session._current_instance = self.executeNow( outargsname )
-        cpumeminfo = ReadCPUMemInfo( monitoringParams )
+      cpumeminfo, self._current_execution_session._current_instance = executeSecondInternal( self.my_container_py.monitoringtimeresms() )
       ##
       self.addInfoOnLevel2("CPUMemDuringExec",cpumeminfo)
-      del monitoringParams
       self.addTimeInfoOnLevel2("endExecTime")
       self.addTimeInfoOnLevel2("startOutputTime")
       argsout=[]
@@ -1130,3 +1181,17 @@ class PyScriptNode_OutOfProcess_Replay_i(PyScriptNode_Abstract_i):
 
   def executeNow(self, outargsname):
     return ExecCrashProofWithReplay(self.code,self.context,outargsname,self.my_container,self._current_execution_session)
+
+class PyScriptNode_OutOfProcess_FT_i(PyScriptNode_Abstract_i):
+  def __init__(self, nodeName, code, poa, my_container, logscript):
+    super().__init__(nodeName, code, poa, my_container, logscript)
+
+  def executeNow(self, outargsname):
+    return ExecCrashProofWithoutReplayFT(self.code,self.context,outargsname,self.my_container,self._current_execution_session)
+
+class PyScriptNode_OutOfProcess_Replay_FT_i(PyScriptNode_Abstract_i):
+  def __init__(self, nodeName, code, poa, my_container, logscript):
+    super().__init__(nodeName, code, poa, my_container, logscript)
+
+  def executeNow(self, outargsname):
+    return ExecCrashProofWithReplayFT(self.code,self.context,outargsname,self.my_container,self._current_execution_session)
index e8b7a973002f0666a8e1a17bb13466135d8897d7..0d2a168fe91a1e232aa2d770b4991ed398602ad6 100644 (file)
@@ -37,6 +37,7 @@ import subprocess as sp
 killMeCode = """
 import os
 import sys
+import signal
 j = 7 * i
 sys.stdout.write(str(j)) ; sys.stdout.flush() # the aime of test in replay mode to be sure that case is runnable
 os.kill( os.getpid() , signal.SIGKILL)# the aim of test is here
@@ -53,6 +54,19 @@ cst = KernelBasis.GetTimeAdjustmentCst()
 KernelBasis.HeatMarcel(5 * nbcore * cst,nbcore)
 j = 8*i"""
 
+killMeAtTheEnd = """import atexit
+import KernelServices
+
+def ErrorAtexit():
+    KernelServices.GenerateViolentMemoryFaultForTestPurpose()
+
+atexit.register(ErrorAtexit)
+
+print("OKKKKKK")
+j = 9 * i
+print("OKKKKKK3333")
+"""
+
 class testPerfLogManager1(unittest.TestCase):
     def test0(self):
         """
@@ -114,7 +128,7 @@ class testPerfLogManager1(unittest.TestCase):
         # now try to replay the failing case
         p = sp.Popen(["python3",os.path.basename(replayInput[0])],cwd = os.path.dirname(replayInput[0]),stdout=sp.PIPE,stderr=sp.PIPE)
         out,err = p.communicate()
-        self.assertEqual(1,p.returncode) # very important ! The failing case must continue to fail :)
+        self.assertNotEqual(p.returncode,0) # very important ! The failing case must continue to fail :)
         self.assertEqual("21".encode(),out) # very important to check that the reported case is standalone enough to be replayable post mortem
         # cleanup
         dn = os.path.dirname(replayInput[0])
@@ -156,6 +170,31 @@ class testPerfLogManager1(unittest.TestCase):
         self.assertGreater(len(greater_than_100),1) # At minimum one measure must report CPU load > 100%
         cont.Shutdown()
 
+    def test3(self):
+        """
+        [EDF29150] : test that we can withstand a crash at exit
+        """
+        salome.salome_init()
+        KernelBasis.SetPyExecutionMode("OutOfProcessWithReplayFT")
+        hostname = "localhost"
+        cp = pylauncher.GetRequestForGiveContainer(hostname,"container_crash_test")
+        salome.cm.SetNumberOfRetry( 3 )
+        salome.cm.SetBigObjOnDiskThreshold(1000)
+        salome.cm.SetOverrideEnvForContainersSimple(env = [])
+        cont = salome.cm.GiveContainer(cp)
+        poa = salome.orb.resolve_initial_references("RootPOA")
+        obj = SALOME_PyNode.SenderByte_i(poa,pickle.dumps( (["i"],{"i": 3} ) )) ; id_o = poa.activate_object(obj) ; refPtr = poa.id_to_reference(id_o)
+        pyscript = cont.createPyScriptNode("testScript4",killMeAtTheEnd)
+        pyscript.executeFirst(refPtr)
+        ret = pyscript.executeSecond(["j"])
+        ret = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() )
+        self.assertEqual(ret,27)
+        with open(cont.locallogfilename) as f:
+            logCont = f.read( )
+            self.assertTrue( "WARNING : Retry #" in logCont)
+            self.assertTrue( "WARNING : Following code has generated non zero return code" in logCont )# should report something into the container
+        cont.Shutdown()
+
 if __name__ == '__main__':
     from salome_utils import positionVerbosityOfLoggerRegardingState,setVerboseLevel,setVerbose
     salome.standalone()