SALOME platform Git repositories - modules/kernel.git/commitdiff
Salome HOME
[EDF29150] : Put additional fault-tolerance mechanism agy/30062_2
author Anthony Geay <anthony.geay@edf.fr>
Wed, 5 Jun 2024 07:48:18 +0000 (09:48 +0200)
committer Anthony Geay <anthony.geay@edf.fr>
Wed, 5 Jun 2024 07:48:18 +0000 (09:48 +0200)
idl/SALOME_Component.idl
idl/SALOME_ContainerManager.idl
src/Basics/KernelBasis.cxx
src/Basics/KernelBasis.hxx
src/Basics/KernelBasis.i
src/Container/Container_i.cxx
src/Container/SALOME_ContainerManager.cxx
src/Container/SALOME_ContainerManager.hxx
src/Container/SALOME_Container_i.hxx
src/Container/SALOME_PyNode.py
src/Launcher/Test/testCrashProofContainer.py

index 04ac5b3405307896fb21b49a67517b63bb95a22b..2d7d3ea5bd4971e20891685b1c5c2bee4c96b39a 100644 (file)
@@ -89,6 +89,8 @@ module Engines
     void set_big_obj_on_disk_threshold(in long thresholdInByte);
 
     void set_big_obj_on_disk_directory(in string directory);
+    
+    void set_number_of_retry(in long nbRetry);
 
     void addLogFileNameGroup(in vectorOfString groupOfLogFileNames);
     
index 32d9c6676ad412ad67bfd49d09a0f7c94710dd33..110c5b41988e8dc0d7b6fb32cdc0282434d711ff 100644 (file)
@@ -112,6 +112,10 @@ interface ContainerManager
 
   void SetBigObjOnDiskDirectory(in string directory);
 
+  void SetNumberOfRetry(in long nbRetry);
+
+  long GetNumberOfRetry();
+
   void SetCodeOnContainerStartUp(in string code);
 
   string GetCodeOnContainerStartUp();
index 5789721adfeeb51f8cb44c94211a644c8ecf8e8d..c236f00206bd1da8866d2dce09878906b761435d 100644 (file)
@@ -155,6 +155,10 @@ void SALOME::SetBigObjOnDiskThreshold(int newThresholdInByte)
 
 static std::string SALOME_FILE_BIG_OBJ_DIR;
 
+constexpr int DFT_SALOME_NB_RETRY = 1;
+
+static int SALOME_NB_RETRY = DFT_SALOME_NB_RETRY;
+
 std::string SALOME::GetBigObjOnDiskDirectory()
 {
   return SALOME_FILE_BIG_OBJ_DIR;
@@ -170,6 +174,16 @@ bool SALOME::BigObjOnDiskDirectoryDefined()
   return ! SALOME_FILE_BIG_OBJ_DIR.empty();
 }
 
+void SALOME::SetNumberOfRetry(int nbRetry)
+{
+  SALOME_NB_RETRY = nbRetry;
+}
+
+int SALOME::GetNumberOfRetry()
+{
+  return SALOME_NB_RETRY;
+}
+
 static SALOME::PyExecutionMode DefaultPyExecMode = SALOME::PyExecutionMode::NotSet;
 
 void SALOME::SetPyExecutionMode(PyExecutionMode mode)
index 8399fc76f1201e685cc4b1f61015ca10d0e17d5a..d2a7bb222030a0fc634afe81cc95c05df27e7f17 100644 (file)
@@ -48,4 +48,6 @@ namespace SALOME
   std::string BASICS_EXPORT GetBigObjOnDiskDirectory();
   void BASICS_EXPORT SetBigObjOnDiskDirectory(const std::string& directory);
   bool BASICS_EXPORT BigObjOnDiskDirectoryDefined();
+  void BASICS_EXPORT SetNumberOfRetry(int nbRetry);
+  int BASICS_EXPORT GetNumberOfRetry();
 }
index 092c7537ff72bcd58b0ab9c7298658fa7ada0aa2..6c5e8536f6e805312a21cd9e217d0f28c90665dd 100644 (file)
@@ -56,6 +56,8 @@ using namespace SALOME;
 %rename (GetBigObjOnDiskDirectory) GetBigObjOnDiskDirectorySwig;
 %rename (SetBigObjOnDiskDirectory) SetBigObjOnDiskDirectorySwig;
 %rename (BigObjOnDiskDirectoryDefined) BigObjOnDiskDirectoryDefinedSwig;
+%rename (SetNumberOfRetry) SetNumberOfRetrySwig;
+%rename (GetNumberOfRetry) GetNumberOfRetrySwig;
 
 bool getSSLMode();
 void setSSLMode(bool sslMode);
@@ -142,6 +144,16 @@ bool BigObjOnDiskDirectoryDefinedSwig()
   return SALOME::BigObjOnDiskDirectoryDefined();
 }
 
+void SetNumberOfRetrySwig(int nbRetry)
+{
+  SALOME::SetNumberOfRetry( nbRetry );
+}
+
+int GetNumberOfRetrySwig()
+{
+  return SALOME::GetNumberOfRetry( );
+}
+
 void SetVerbosityLevelSwig(const std::string& level)
 {
   SetVerbosityLevelStr(level);
index a12edd4ff553546ffd4532e47d0f6a4bf43ed9aa..4e0d5fe93fd0bd565887c1934bec78595da94a12 100644 (file)
@@ -1180,6 +1180,11 @@ void Abstract_Engines_Container_i::set_big_obj_on_disk_directory(const char *dir
   SALOME::SetBigObjOnDiskDirectory(directory);
 }
 
+void Abstract_Engines_Container_i::set_number_of_retry(CORBA::Long nbRetry)
+{
+  SALOME::SetNumberOfRetry( nbRetry );
+}
+
 Engines::vectorOfString_var FromVecStringCppToCORBA( const std::vector<std::string>& group)
 {
   Engines::vectorOfString_var ret( new Engines::vectorOfString );
index 391c4076d1ba0a92c31100f97935024c1f16a5a1..2ee5639bce06ee691dc02b32a942cfa40fb8ed75 100644 (file)
@@ -246,6 +246,16 @@ void SALOME_ContainerManager::SetBigObjOnDiskDirectory(const char *directory)
   SALOME::SetBigObjOnDiskDirectory(directory);
 }
 
+ void SALOME_ContainerManager::SetNumberOfRetry(CORBA::Long nbRetry)
+ {
+    SALOME::SetNumberOfRetry( nbRetry );
+ }
+
+CORBA::Long SALOME_ContainerManager::GetNumberOfRetry()
+{
+  return SALOME::GetNumberOfRetry();
+}
+
 //=============================================================================
 //! Loop on all the containers listed in naming service, ask shutdown on each
 /*! CORBA Method:
@@ -541,6 +551,7 @@ Engines::Container_ptr SALOME_ContainerManager::GiveContainer(const Engines::Con
         INFOS("[GiveContainer] container " << containerNameInNS << " override " << envInfo.str());
         cont->set_big_obj_on_disk_directory( SALOME::GetBigObjOnDiskDirectory().c_str() );
         cont->set_big_obj_on_disk_threshold( SALOME::GetBigObjOnDiskThreshold() );
+        cont->set_number_of_retry( SALOME::GetNumberOfRetry() );
         Engines::FieldsDict envCorba;
         {
           auto sz = _override_env.size();
index 6119ec393ac3299f073a9b2211a11ea8b7d4ace8..a02bd92587fb00eb25be733ab2182662e43d331f 100644 (file)
@@ -83,6 +83,10 @@ public:
 
   void SetBigObjOnDiskDirectory(const char *directory) override;
 
+  void SetNumberOfRetry(CORBA::Long nbRetry) override;
+
+  CORBA::Long GetNumberOfRetry() override;
+
   static const char *_ContainerManagerNameInNS;
 
 private:
index 4ade76e5baa65675a60f53c523e1c19c691b4a0d..08c174254c09cff57123be2289ca9f97d7fc380f 100644 (file)
@@ -86,6 +86,8 @@ public:
 
   void set_big_obj_on_disk_directory(const char *directory) override;
 
+  void set_number_of_retry(CORBA::Long nbRetry) override;
+
   void addLogFileNameGroup(const Engines::vectorOfString& groupOfLogFileNames) override;
     
   Engines::vectorOfVectorOfString *getAllLogFileNameGroups() override;
index 0069e15c7293b886c779939254b96ae2ae485cd4..0b0c9b3cd66c07b02c19efa156cc3e8c481a715f 100644 (file)
@@ -562,6 +562,9 @@ class GenericPythonMonitoringLauncherCtxMgr:
     
     def __exit__(self,exctype, exc, tb):
         StopMonitoring( self._monitoring_params )
+        del self._monitoring_params
+        import gc
+        gc.collect() # force destruction of objects even in raise context
 
 def StopMonitoring( monitoringInfo ):
   """
@@ -732,6 +735,9 @@ with open(inputFileName,"rb") as f:
 context[MY_PERFORMANCE_LOG_ENTRY_IN_GLBS] = eval( MY_PERFORMANCE_LOG_ENTRY_IN_GLBS )
 with open(codeFileName,"r") as f:
   code = f.read()
+#
+import gc
+gc.disable()
 # go for execution
 exec( code , context )
 # filter part of context to be exported to father process
@@ -838,6 +844,7 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL
 
   #
   def InternalExecResistant( code, context, outargsname):
+    import KernelBasis
     orb = CORBA.ORB_init([''])
     iorScriptLog = orb.object_to_string( instanceOfLogOfCurrentSession._remote_handle )#ref ContainerScriptPerfLog_ptr
     ####
@@ -860,9 +867,14 @@ sys.stderr.flush()""".format( MY_KEY_TO_DETECT_FINISH ) )
       mainExecFileName = os.path.abspath( "mainexecsafe_{}.py".format( RetrieveUniquePartFromPfx( codeFileName  ) ) )
       with open(mainExecFileName,"w") as f:
         f.write( FinalCode.format( codeFileName, contextFileName, resFileName, outargsname, iorScriptLog ) )
-      p = sp.Popen(["python3", mainExecFileName],stdout = sp.PIPE, stderr = sp.PIPE)
-      stdout, stderr = p.communicate()
-      returnCode = p.returncode
+      for iTry in range( KernelBasis.GetNumberOfRetry() ):
+        if iTry > 0:
+          print( "WARNING : Retry # {}. Following code has generated non zero return code ( {} ). Trying again ... \n{}".format( iTry, returnCode, code ) )
+        p = sp.Popen(["python3", mainExecFileName],stdout = sp.PIPE, stderr = sp.PIPE)
+        stdout, stderr = p.communicate()
+        returnCode = p.returncode
+        if returnCode == 0:
+          break
     return returnCode, stdout, stderr, PythonFunctionEvaluatorParams(mainExecFileName,codeFileName,contextFileName,resFileName)
   ret = instanceOfLogOfCurrentSession._current_instance
   returnCode, stdout, stderr, evParams = InternalExecResistant( code, context, outargsname )
@@ -924,7 +936,13 @@ class LogOfCurrentExecutionSession(LogOfCurrentExecutionSessionAbs):
     self.finalizeAndPushToMaster()
 
   def finalizeAndPushToMaster(self):
-    self._remote_handle.assign( pickle.dumps( self._current_instance ) )
+    """
+    Voluntary do nothing in case of problem to avoid to trouble execution
+    """
+    try:
+      self._remote_handle.assign( pickle.dumps( self._current_instance ) )
+    except:
+      pass
 
 class LogOfCurrentExecutionSessionStub(LogOfCurrentExecutionSessionAbs):
   """
@@ -1049,17 +1067,20 @@ class PyScriptNode_Abstract_i(Engines__POA.PyScriptNode,Generic,abc.ABC):
 
   def executeSecond(self,outargsname):
     """ Same than second part of self.execute to reduce memory peak."""
+    def executeSecondInternal(monitoringtimeresms):
+      with GenericPythonMonitoringLauncherCtxMgr( CPUMemoryMonitoring( monitoringtimeresms ) ) as monitoringParams:
+        currentInstance = self.executeNow( outargsname )
+        cpumeminfo = ReadCPUMemInfo( monitoringParams )
+      return cpumeminfo, currentInstance
+
     import sys
     try:
       self.addTimeInfoOnLevel2("startExecTime")
       ##
       self.addInfoOnLevel2("measureTimeResolution",self.my_container_py.monitoringtimeresms())
-      with GenericPythonMonitoringLauncherCtxMgr( CPUMemoryMonitoring( self.my_container_py.monitoringtimeresms() ) ) as monitoringParams:
-        self._current_execution_session._current_instance = self.executeNow( outargsname )
-        cpumeminfo = ReadCPUMemInfo( monitoringParams )
+      cpumeminfo, self._current_execution_session._current_instance = executeSecondInternal( self.my_container_py.monitoringtimeresms() )
       ##
       self.addInfoOnLevel2("CPUMemDuringExec",cpumeminfo)
-      del monitoringParams
       self.addTimeInfoOnLevel2("endExecTime")
       self.addTimeInfoOnLevel2("startOutputTime")
       argsout=[]
index 6b0ca88a8414f8d6b93e65ec0683cfa08442f66f..0d2a168fe91a1e232aa2d770b4991ed398602ad6 100644 (file)
@@ -37,6 +37,7 @@ import subprocess as sp
 killMeCode = """
 import os
 import sys
+import signal
 j = 7 * i
 sys.stdout.write(str(j)) ; sys.stdout.flush() # the aime of test in replay mode to be sure that case is runnable
 os.kill( os.getpid() , signal.SIGKILL)# the aim of test is here
@@ -127,7 +128,7 @@ class testPerfLogManager1(unittest.TestCase):
         # now try to replay the failing case
         p = sp.Popen(["python3",os.path.basename(replayInput[0])],cwd = os.path.dirname(replayInput[0]),stdout=sp.PIPE,stderr=sp.PIPE)
         out,err = p.communicate()
-        self.assertEqual(1,p.returncode) # very important ! The failing case must continue to fail :)
+        self.assertNotEqual(p.returncode,0) # very important ! The failing case must continue to fail :)
         self.assertEqual("21".encode(),out) # very important to check that the reported case is standalone enough to be replayable poste mortem
         # cleanup
         dn = os.path.dirname(replayInput[0])
@@ -177,6 +178,7 @@ class testPerfLogManager1(unittest.TestCase):
         KernelBasis.SetPyExecutionMode("OutOfProcessWithReplayFT")
         hostname = "localhost"
         cp = pylauncher.GetRequestForGiveContainer(hostname,"container_crash_test")
+        salome.cm.SetNumberOfRetry( 3 )
         salome.cm.SetBigObjOnDiskThreshold(1000)
         salome.cm.SetOverrideEnvForContainersSimple(env = [])
         cont = salome.cm.GiveContainer(cp)
@@ -188,7 +190,9 @@ class testPerfLogManager1(unittest.TestCase):
         ret = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() )
         self.assertEqual(ret,27)
         with open(cont.locallogfilename) as f:
-            self.assertTrue( "WARNING : Following code has generated non zero return code" in f.read() )# should report something into the container
+            logCont = f.read( )
+            self.assertTrue( "WARNING : Retry #" in logCont)
+            self.assertTrue( "WARNING : Following code has generated non zero return code" in logCont )# should report something into the container
         cont.Shutdown()
 
 if __name__ == '__main__':