]> SALOME platform Git repositories - modules/kernel.git/commitdiff
Salome HOME
[EDF30875] : Retry on timeout
authorAnthony Geay <anthony.geay@edf.fr>
Fri, 6 Sep 2024 12:06:48 +0000 (14:06 +0200)
committerCédric Aguerre <cedric.aguerre@edf.fr>
Fri, 13 Sep 2024 07:46:59 +0000 (09:46 +0200)
14 files changed:
idl/SALOME_Component.idl
idl/SALOME_LogManager.idl
src/Basics/KernelBasis.cxx
src/Basics/KernelBasis.hxx
src/Basics/KernelBasis.i
src/Container/Container_i.cxx
src/Container/SALOME_ContainerHelper.py
src/Container/SALOME_ContainerManager.cxx
src/Container/SALOME_Container_i.hxx
src/Container/SALOME_PyNode.py
src/Launcher/SALOME_LogManager.cxx
src/Launcher/SALOME_LogManager.hxx
src/Launcher/SALOME_LogManager.py
src/Launcher/Test/testCrashProofContainer.py

index 3e550f8ba34f2069fc376413ab8892ce3df74029..df1d3f4ae6f7560011b8b2cce57267cddae0491d 100644 (file)
@@ -93,6 +93,8 @@ module Engines
     void set_directory_for_replay_files(in string directory);
     
     void set_number_of_retry(in long nbRetry);
+    
+    void set_timeout_before_retrying(in long timeInSecond);
 
     void set_current_directory(in string cwd);
 
index da941f112b9ce83b6020aff35d040d2a212b4417..1ff23f3665c723bbbe988740d35586a67b32549e 100644 (file)
@@ -28,6 +28,7 @@ module Engines
   interface ContainerScriptExecPerfLog
   {
     void assign(in SALOME::vectorOfByte value);
+    void assignAndAppendFreestyle(in SALOME::vectorOfByte value);
     SALOME::vectorOfByte getObj();
   };
 
index e99ef589e0132d7c9f347531a58487394ac2c038..df7246488bd64b4ee3cde3eb5e628f58e21e3c34 100644 (file)
@@ -167,6 +167,10 @@ constexpr int DFT_SALOME_NB_RETRY = 1;
 
 static int SALOME_NB_RETRY = DFT_SALOME_NB_RETRY;
 
+constexpr int DFT_SALOME_PY_EXEC_TIMEOUT = -1;
+
+static int SALOME_PY_EXEC_TIMEOUT = DFT_SALOME_PY_EXEC_TIMEOUT;
+
 static bool SALOME_FW_CWD_STATUS = false;
 
 SALOME::BigObjTransferProtocol SALOME::FromIntToBigObjOnDiskProtocol(char protocol)
@@ -262,6 +266,16 @@ int SALOME::GetNumberOfRetry()
   return SALOME_NB_RETRY;
 }
 
+void SALOME::SetExecutionTimeOut(int timeOutInSecond)
+{
+  SALOME_PY_EXEC_TIMEOUT = timeOutInSecond;
+}
+
+int SALOME::GetExecutionTimeOut()
+{
+  return SALOME_PY_EXEC_TIMEOUT;
+}
+
 static SALOME::PyExecutionMode DefaultPyExecMode = SALOME::PyExecutionMode::NotSet;
 
 void SALOME::SetPyExecutionMode(PyExecutionMode mode)
index de5c1e092422700d04b6443ac9b00c7ac7a54ff0..50d9d3481a62a47409669db3ec9c6e7e9837286a 100644 (file)
@@ -65,6 +65,8 @@ namespace SALOME
   bool BASICS_EXPORT BigObjOnDiskDirectoryDefined();
   void BASICS_EXPORT SetNumberOfRetry(int nbRetry);
   int BASICS_EXPORT GetNumberOfRetry();
+  void BASICS_EXPORT SetExecutionTimeOut(int timeOutInSecond);
+  int BASICS_EXPORT GetExecutionTimeOut();
   bool BASICS_EXPORT GetForwardCurrentDirectoryStatus();
   void BASICS_EXPORT SetForwardCurrentDirectoryStatus(bool newStatus);
   std::string BASICS_EXPORT GetCurrentWorkingDirectory();
index e24211dcaf06bec030eef999e8119bc53ef82c05..06e0e0e785dc91ab96039bfb08831e34520c7b0d 100644 (file)
@@ -62,6 +62,8 @@ using namespace SALOME;
 %rename (SetDirectoryForReplayFiles) SetDirectoryForReplayFilesSwig;
 %rename (SetNumberOfRetry) SetNumberOfRetrySwig;
 %rename (GetNumberOfRetry) GetNumberOfRetrySwig;
+%rename (SetExecutionTimeOut) SetExecutionTimeOutSwig;
+%rename (GetExecutionTimeOut) GetExecutionTimeOutSwig;
 %rename (GetForwardCurrentDirectoryStatus) GetForwardCurrentDirectoryStatusSwig;
 %rename (SetForwardCurrentDirectoryStatus) SetForwardCurrentDirectoryStatusSwig;
 
@@ -155,6 +157,16 @@ int GetNumberOfRetrySwig()
   return SALOME::GetNumberOfRetry( );
 }
 
+void SetExecutionTimeOutSwig(int timeOutInSecond)
+{
+  SALOME::SetExecutionTimeOut( timeOutInSecond );
+}
+
+int GetExecutionTimeOutSwig()
+{
+  return SALOME::GetExecutionTimeOut();
+}
+
 bool GetForwardCurrentDirectoryStatusSwig()
 {
   return SALOME::GetForwardCurrentDirectoryStatus();
index cc07f9fdc656fe7c7b0c80e6ecf75f93be6fa357..64ab988047a0712f2bf0344f4b637a2dfa6b6414 100644 (file)
@@ -1185,6 +1185,11 @@ void Abstract_Engines_Container_i::set_number_of_retry(CORBA::Long nbRetry)
   SALOME::SetNumberOfRetry( nbRetry );
 }
 
+void Abstract_Engines_Container_i::set_timeout_before_retrying(CORBA::Long timeInSecond)
+{
+  SALOME::SetExecutionTimeOut( timeInSecond );
+}
+
 void Abstract_Engines_Container_i::set_current_directory(const char *cwd)
 {
   AutoGIL gstate;
index 82bef6bfad5b04865e6fe31ef592e3dbf1c34797..da46cd6ed7ec2321244b00384c56d8ec2df4ae2c 100644 (file)
@@ -76,7 +76,10 @@ class ScriptExecInfo:
     
     @freestyle.setter
     def freestyle(self, value):
-       self._freestyle_log.append( value )
+       self._freestyle_log = [ value ]
+
+    def preappendFreestyle(self, value):
+       self._freestyle_log = value + self._freestyle_log
 
     @property
     def measureTimeResolution(self):
index 8566a0de80ad5a8e089444676c0391cff055a603..3d30d9a1a82f8505cc345f8f1014c3c2208ca870 100644 (file)
@@ -558,6 +558,7 @@ Engines::Container_ptr SALOME_ContainerManager::GiveContainer(const Engines::Con
         cont->set_big_obj_on_disk_threshold( SALOME::GetBigObjOnDiskThreshold() );
         cont->set_directory_for_replay_files( SALOME::GetDirectoryForReplayFiles().c_str() );
         cont->set_number_of_retry( SALOME::GetNumberOfRetry() );
+        cont->set_timeout_before_retrying( SALOME::GetExecutionTimeOut() );
         Engines::FieldsDict envCorba;
         {
           auto sz = _override_env.size();
index ae96b0544c168b94037ca2b244b5ca4a83b0ddc2..592a61a0c864ddfa547bb5c8eabd3043f4ec701f 100644 (file)
@@ -88,6 +88,8 @@ public:
   void set_big_obj_on_disk_directory(const char *directory) override;
 
   void set_number_of_retry(CORBA::Long nbRetry) override;
+  
+  void set_timeout_before_retrying(CORBA::Long timeInSecond) override;
 
   void set_current_directory(const char *cwd) override;
 
index 9c964354d6d33d5d99e6a9bd626446eda5533b58..062bea530318ef33ea1152698a7e0ad2dee819de 100644 (file)
@@ -855,9 +855,11 @@ class SeqByteReceiver:
   
 FinalCode = """import pickle
 from SALOME_PyNode import LogOfCurrentExecutionSession,MY_PERFORMANCE_LOG_ENTRY_IN_GLBS
+from KernelBasis import VerbosityActivated
 import CORBA
 import Engines
 import os
+from datetime import datetime
 # WorkDir may be important to replay : "{}"
 orb = CORBA.ORB_init([''])
 caseDirectory = "{}"
@@ -867,18 +869,30 @@ outputFileName = os.path.join( caseDirectory, "{}" )
 del os
 outputsKeys = {}
 exec( "{{}} = LogOfCurrentExecutionSession( orb.string_to_object( \\"{}\\" ) )".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) )
+if VerbosityActivated():
+  eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("b4loadctx",datetime.now()) )
 with open(inputFileName,"rb") as f:
   context = pickle.load( f )
+if VerbosityActivated():
+  eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("afterloadctx",datetime.now()) )
 context[MY_PERFORMANCE_LOG_ENTRY_IN_GLBS] = eval( MY_PERFORMANCE_LOG_ENTRY_IN_GLBS )
 with open(codeFileName,"r") as f:
   code = f.read()
 # go for execution
+if VerbosityActivated():
+  eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("bforeexec",datetime.now()) )
 exec( code , context )
+if VerbosityActivated():
+  eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("afterexec",datetime.now()) )
 # filter part of context to be exported to father process
 context = dict( [(k,v) for k,v in context.items() if k in outputsKeys] )
+if VerbosityActivated():
+  eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("strtdumpout",datetime.now()) )
 #
 with open(outputFileName,"wb") as f:
   pickle.dump( context, f )
+if VerbosityActivated():
+  eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("afterdump",datetime.now()) )
 """
 
 class PythonFunctionEvaluatorParams:
@@ -963,6 +977,7 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL
   import pickle
   import subprocess as sp
   import CORBA
+  import logging
   #
   def IsConsideredAsOKRun( returnCode, closeEyesOnErrorAtExit , stderr ):
     def StdErrTreatment(closeEyesOnErrorAtExit , stderr):
@@ -1009,13 +1024,26 @@ sys.stderr.flush()""".format( MY_KEY_TO_DETECT_FINISH ) )
       mainExecFileName = os.path.join( dirForReplayFiles, "mainexecsafe_{}.py".format( RetrieveUniquePartFromPfx( codeFileName  ) ) )
       with open(mainExecFileName,"w") as f:
         f.write( FinalCode.format( os.getcwd(), dirForReplayFiles, codeFileName, os.path.basename( contextFileName ), os.path.basename( resFileName ), outargsname, iorScriptLog ) )
-      for iTry in range( KernelBasis.GetNumberOfRetry() ):
+      timeOut = KernelBasis.GetExecutionTimeOut()
+      nbRetry = KernelBasis.GetNumberOfRetry()
+      logging.debug( "Nb retry = {}   Timout in seconds = {}".format( nbRetry, timeOut ) )
+      for iTry in range( nbRetry ):
         if iTry > 0:
           print( "WARNING : Retry # {}. Following code has generated non zero return code ( {} ). Trying again ... \n{}".format( iTry, returnCode, code ) )
         p = sp.Popen(["python3", mainExecFileName],cwd = os.getcwd(),stdout = sp.PIPE, stderr = sp.PIPE)
-        stdout, stderr = p.communicate()
-        returnCode = p.returncode
+        try:
+          args = {}
+          if timeOut > 0:
+            args["timeout"] = timeOut
+          stdout, stderr = p.communicate( **args )
+        except sp.TimeoutExpired as e:
+          print( "WARNING : during retry #{} timeout set to {} s has failed !".format( iTry, timeOut ) )
+          returnCode = 10000000000 + timeOut
+        else:
+          returnCode = p.returncode
         if returnCode == 0:
+          if iTry >= 1:
+            logging.warning( "At Retry #{} it's successful :)".format(iTry) )
           break
     return returnCode, stdout, stderr, PythonFunctionEvaluatorParams(mainExecFileName,codeFileNameFull,contextFileName,resFileName)
   ret = instanceOfLogOfCurrentSession._current_instance
@@ -1075,7 +1103,7 @@ class LogOfCurrentExecutionSession(LogOfCurrentExecutionSessionAbs):
 
   def addFreestyleAndFlush(self, value):
     self._current_instance.freestyle = value
-    self.finalizeAndPushToMaster()
+    self.finalizeAndPushToMasterAppendFreestyle()
 
   def finalizeAndPushToMaster(self):
     """
@@ -1085,6 +1113,15 @@ class LogOfCurrentExecutionSession(LogOfCurrentExecutionSessionAbs):
       self._remote_handle.assign( pickle.dumps( self._current_instance ) )
     except:
       pass
+    
+  def finalizeAndPushToMasterAppendFreestyle(self):
+    """
+    Voluntary do nothing in case of problem to avoid to trouble execution
+    """
+    try:
+      self._remote_handle.assignAndAppendFreestyle( pickle.dumps( self._current_instance ) )
+    except:
+      pass
 
 class LogOfCurrentExecutionSessionStub(LogOfCurrentExecutionSessionAbs):
   """
index f006eb24575c3cd4dade1993e9181e91ec77988e..1a83a2ce5539eb97022f311cb171f5475689e232 100644 (file)
@@ -82,6 +82,29 @@ void SALOME_ContainerScriptExecPerfLog::assign(const SALOME::vectorOfByte& value
   _data = FromPyToCpp(s);
 }
 
+void SALOME_ContainerScriptExecPerfLog::assignAndAppendFreestyle(const SALOME::vectorOfByte& value)
+{
+  auto sz = value.length();
+  std::vector<char> data( sz );
+  for(auto i = 0 ; i < sz ; ++i)
+    data[i] = value[i];
+  //
+  {
+    AutoGIL gstate;
+    //https://docs.python.org/3/c-api/arg.html#c.Py_BuildValue
+    AutoPyRef dataPyFromRemote(PyBytes_FromStringAndSize(data.data(),data.size()));
+    AutoPyRef dataPyAlreadyOnSite(PyBytes_FromStringAndSize(_data.data(),_data.size()));
+    AutoPyRef result(PyObject_CallMethod(pyObj(),(char*)"flushAndAppendFreestyle","OO",dataPyAlreadyOnSite.get(),dataPyFromRemote.get(),nullptr) ) ;
+    if (PyErr_Occurred())
+    {
+      std::string error("can not assignAndAppendFreestyle");
+      PyErr_Print();
+      THROW_SALOME_CORBA_EXCEPTION(error.c_str(),SALOME::INTERNAL_ERROR);
+    }
+    _data = FromPyToCpp(result);
+  }
+}
+
 SALOME::vectorOfByte *SALOME_ContainerScriptExecPerfLog::getObj()
 {
   return FromVectCharToCorba(this->_data);
index 33d9f2cf83455f70439d6fc4cbc8cae261fe91f7..d201ed1e042faa0b2008af8a68f0a39943c27de4 100644 (file)
@@ -106,6 +106,7 @@ public:
   PyObject *pyObj() { return _pyExecutionLog.get(); }
   PortableServer::POA_var getPOA();
   void assign(const SALOME::vectorOfByte& value) override;
+  void assignAndAppendFreestyle(const SALOME::vectorOfByte& value) override;
   SALOME::vectorOfByte *getObj() override;
   const std::vector<char>& data() const { return _data; }
   void setData(std::vector<char>&& data) { _data = std::move(data); }
index 2af278a4be2ea446251ed9bd4ca656236342349b..58114fc413be3453069c667f74d276da0f534c37 100644 (file)
@@ -39,6 +39,22 @@ class SALOME_ContainerScriptExecPerfLog:
     setattr(obj,"tracePosStop",self._stop_pos)
     setattr(obj,"tracePosStart",self._start_pos)
     return pickle.dumps(obj)
+  
+  def flushAndAppendFreestyle(self,alreadyOnSiteBytes,instanceRemoteBytes):
+    """
+    :param alreadyOnSiteBytes: pickle of instance of ScriptExecInfo of previous value (if any) [bytes]
+    :param instanceRemoteBytes: pickle of current instance of ScriptExecInfo (if any) [bytes]
+    """
+    alreadyOnSite = None
+    if len( alreadyOnSiteBytes ) > 0:
+      alreadyOnSite = pickle.loads(alreadyOnSiteBytes)
+    instanceRemote = pickle.loads(instanceRemoteBytes)
+    self._stop_pos = os.path.getsize( self.father.father.logfilename )
+    setattr(instanceRemote,"tracePosStop",self._stop_pos)
+    setattr(instanceRemote,"tracePosStart",self._start_pos)
+    if alreadyOnSite:
+      instanceRemote.preappendFreestyle( alreadyOnSite._freestyle_log )
+    return pickle.dumps(instanceRemote)
 
   def start(self):
     self._start_pos = os.path.getsize( self.father.father.logfilename )
index d05e72461614f212321905698383cdbf75d05825..aad434dbfcdfafb11775093d623bb0e95e411cf5 100644 (file)
@@ -34,6 +34,7 @@ import tempfile
 import logging
 from datetime import datetime
 import subprocess as sp
+from pathlib import Path
 
 killMeCode = """
 import os
@@ -83,6 +84,15 @@ k = os.getcwd()
 print("OKKKKKK3333")
 """
 
+FunnyCase = """import os
+import time
+if not os.path.exists( {!r} ):
+    time.sleep( 20 ) # first exec
+else:
+    time.sleep( 2 )  # second exec after retry
+j = 44
+"""
+
 class testPerfLogManager1(unittest.TestCase):
     def test0(self):
         """
@@ -111,7 +121,8 @@ class testPerfLogManager1(unittest.TestCase):
             ret = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() )
             self.assertEqual(ret,24) # container has received a SIGKILL but it kindly continue to respond :)
             a = salome.logm.NaiveFetch()
-            self.assertEqual(a[0][2][0].get().freestyle,[('a',777)])
+            dicToTest = {k:v for k,v in a[0][2][0].get().freestyle}
+            self.assertEqual(dicToTest['a'],777)
             cont.Shutdown()
 
     def test1(self):
@@ -141,7 +152,10 @@ class testPerfLogManager1(unittest.TestCase):
             ret = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() )
             self.assertEqual(ret,24) # container has received a SIGKILL but it kindly continue to respond :)
             a = salome.logm.NaiveFetch()
-            self.assertEqual(a[0][2][0].get().freestyle,[('a',777)])
+            dicToTest = {k:v for k,v in a[0][2][0].get().freestyle}
+            self.assertEqual(dicToTest['a'],777)
+            # EDF30875 : check in verbose mode of presence of entries to debug
+            self.assertEqual( set(dicToTest), {'strtdumpout', 'b4loadctx', 'afterloadctx', 'afterdump', 'bforeexec', 'a', 'afterexec'} )
             grpsOfLogToKill = cont.getAllLogFileNameGroups()
             self.assertEqual(1,len(grpsOfLogToKill))
             replayInput = grpsOfLogToKill[0]
@@ -248,6 +262,45 @@ class testPerfLogManager1(unittest.TestCase):
             self.assertEqual( ret2, str(tmpdirname) )
             cont.Shutdown()
 
+    def test5(self):
+        """
+        EDF30875 : Verbose mode set to ON 
+        """
+        from threading import Thread
+        def func( fname ):
+            import time
+            time.sleep( 5 )
+            with open( fname, "w" ) as f:
+                f.write( "go" )
+        # file used to pilot the behaviour of process
+        fname = "touch.txt"
+        KernelBasis.SetNumberOfRetry(2)
+        KernelBasis.SetExecutionTimeOut(10) # <= Key Point is here
+        KernelBasis.SetPyExecutionMode("OutOfProcessNoReplayFT") # Fail tolerant
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            os.chdir( tmpdirname )
+            fnameFull = Path(tmpdirname) / fname
+            hostname = "localhost"
+            cp = pylauncher.GetRequestForGiveContainer(hostname,"container_crash_test")
+            salome.cm.SetDirectoryForReplayFiles( str( tmpdirname ) )
+            with salome.ContainerLauncherCM(cp,True) as cont:
+                poa = salome.orb.resolve_initial_references("RootPOA")
+                obj = SALOME_PyNode.SenderByte_i(poa,pickle.dumps( (["i"],{"i": 3} ) )) ; id_o = poa.activate_object(obj) ; refPtr = poa.id_to_reference(id_o)
+                pyscript = cont.createPyScriptNode("testScript5",FunnyCase.format( fnameFull.as_posix() ))
+                t = Thread(target = func,args=[fnameFull.as_posix()])
+                t.start()
+                pyscript.executeFirst(refPtr)
+                ret = pyscript.executeSecond(["j"])
+                t.join()
+                ret0 = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() )
+                self.assertEqual(ret0,44)
+                a = salome.logm.NaiveFetch()
+                #res = a[0][1][0]
+                #b = SALOME_ContainerHelper.ScriptExecInfoDeco(res,a[0][1]) print( a[0].log) 
+                # print( res.get().freestyle )
+                #print( [elt[0] for elt in res.get().freestyle] )
+                #self.assertEqual( [elt[0] for elt in res.get().freestyle], ['b4loadctx', 'afterloadctx', 'bforeexec', 'b4loadctx', 'afterloadctx', 'bforeexec', 'afterexec', 'strtdumpout', 'afterdump'] )
+
 if __name__ == '__main__':
     from salome_utils import positionVerbosityOfLoggerRegardingState,setVerboseLevel,setVerbose
     salome.standalone()