From 5e9167f08507888538f48287a338201475fc88e3 Mon Sep 17 00:00:00 2001 From: Anthony Geay Date: Fri, 6 Sep 2024 14:06:48 +0200 Subject: [PATCH] [EDF30875] : Retry on timeout --- idl/SALOME_Component.idl | 2 + idl/SALOME_LogManager.idl | 1 + src/Basics/KernelBasis.cxx | 14 +++++ src/Basics/KernelBasis.hxx | 2 + src/Basics/KernelBasis.i | 12 +++++ src/Container/Container_i.cxx | 5 ++ src/Container/SALOME_ContainerHelper.py | 5 +- src/Container/SALOME_ContainerManager.cxx | 1 + src/Container/SALOME_Container_i.hxx | 2 + src/Container/SALOME_PyNode.py | 45 ++++++++++++++-- src/Launcher/SALOME_LogManager.cxx | 23 ++++++++ src/Launcher/SALOME_LogManager.hxx | 1 + src/Launcher/SALOME_LogManager.py | 16 ++++++ src/Launcher/Test/testCrashProofContainer.py | 57 +++++++++++++++++++- 14 files changed, 179 insertions(+), 7 deletions(-) diff --git a/idl/SALOME_Component.idl b/idl/SALOME_Component.idl index 3e550f8ba..df1d3f4ae 100644 --- a/idl/SALOME_Component.idl +++ b/idl/SALOME_Component.idl @@ -93,6 +93,8 @@ module Engines void set_directory_for_replay_files(in string directory); void set_number_of_retry(in long nbRetry); + + void set_timeout_before_retrying(in long timeInSecond); void set_current_directory(in string cwd); diff --git a/idl/SALOME_LogManager.idl b/idl/SALOME_LogManager.idl index da941f112..1ff23f366 100644 --- a/idl/SALOME_LogManager.idl +++ b/idl/SALOME_LogManager.idl @@ -28,6 +28,7 @@ module Engines interface ContainerScriptExecPerfLog { void assign(in SALOME::vectorOfByte value); + void assignAndAppendFreestyle(in SALOME::vectorOfByte value); SALOME::vectorOfByte getObj(); }; diff --git a/src/Basics/KernelBasis.cxx b/src/Basics/KernelBasis.cxx index e99ef589e..df7246488 100644 --- a/src/Basics/KernelBasis.cxx +++ b/src/Basics/KernelBasis.cxx @@ -167,6 +167,10 @@ constexpr int DFT_SALOME_NB_RETRY = 1; static int SALOME_NB_RETRY = DFT_SALOME_NB_RETRY; +constexpr int DFT_SALOME_PY_EXEC_TIMEOUT = -1; + +static int SALOME_PY_EXEC_TIMEOUT = DFT_SALOME_PY_EXEC_TIMEOUT; + static bool SALOME_FW_CWD_STATUS = false; SALOME::BigObjTransferProtocol SALOME::FromIntToBigObjOnDiskProtocol(char protocol) @@ -262,6 +266,16 @@ int SALOME::GetNumberOfRetry() return SALOME_NB_RETRY; } +void SALOME::SetExecutionTimeOut(int timeOutInSecond) +{ + SALOME_PY_EXEC_TIMEOUT = timeOutInSecond; +} + +int SALOME::GetExecutionTimeOut() +{ + return SALOME_PY_EXEC_TIMEOUT; +} + static SALOME::PyExecutionMode DefaultPyExecMode = SALOME::PyExecutionMode::NotSet; void SALOME::SetPyExecutionMode(PyExecutionMode mode) diff --git a/src/Basics/KernelBasis.hxx b/src/Basics/KernelBasis.hxx index de5c1e092..50d9d3481 100644 --- a/src/Basics/KernelBasis.hxx +++ b/src/Basics/KernelBasis.hxx @@ -65,6 +65,8 @@ namespace SALOME bool BASICS_EXPORT BigObjOnDiskDirectoryDefined(); void BASICS_EXPORT SetNumberOfRetry(int nbRetry); int BASICS_EXPORT GetNumberOfRetry(); + void BASICS_EXPORT SetExecutionTimeOut(int timeOutInSecond); + int BASICS_EXPORT GetExecutionTimeOut(); bool BASICS_EXPORT GetForwardCurrentDirectoryStatus(); void BASICS_EXPORT SetForwardCurrentDirectoryStatus(bool newStatus); std::string BASICS_EXPORT GetCurrentWorkingDirectory(); diff --git a/src/Basics/KernelBasis.i b/src/Basics/KernelBasis.i index e24211dca..06e0e0e78 100644 --- a/src/Basics/KernelBasis.i +++ b/src/Basics/KernelBasis.i @@ -62,6 +62,8 @@ using namespace SALOME; %rename (SetDirectoryForReplayFiles) SetDirectoryForReplayFilesSwig; %rename (SetNumberOfRetry) SetNumberOfRetrySwig; %rename (GetNumberOfRetry) GetNumberOfRetrySwig; +%rename (SetExecutionTimeOut) SetExecutionTimeOutSwig; +%rename (GetExecutionTimeOut) GetExecutionTimeOutSwig; %rename (GetForwardCurrentDirectoryStatus) GetForwardCurrentDirectoryStatusSwig; %rename (SetForwardCurrentDirectoryStatus) SetForwardCurrentDirectoryStatusSwig; @@ -155,6 +157,16 @@ int GetNumberOfRetrySwig() return SALOME::GetNumberOfRetry( ); } +void SetExecutionTimeOutSwig(int timeOutInSecond) +{ + SALOME::SetExecutionTimeOut( timeOutInSecond ); +} + +int GetExecutionTimeOutSwig() +{ + return SALOME::GetExecutionTimeOut(); +} + bool GetForwardCurrentDirectoryStatusSwig() { return SALOME::GetForwardCurrentDirectoryStatus(); diff --git a/src/Container/Container_i.cxx b/src/Container/Container_i.cxx index cc07f9fdc..64ab98804 100644 --- a/src/Container/Container_i.cxx +++ b/src/Container/Container_i.cxx @@ -1185,6 +1185,11 @@ void Abstract_Engines_Container_i::set_number_of_retry(CORBA::Long nbRetry) SALOME::SetNumberOfRetry( nbRetry ); } +void Abstract_Engines_Container_i::set_timeout_before_retrying(CORBA::Long timeInSecond) +{ + SALOME::SetExecutionTimeOut( timeInSecond ); +} + void Abstract_Engines_Container_i::set_current_directory(const char *cwd) { AutoGIL gstate; diff --git a/src/Container/SALOME_ContainerHelper.py b/src/Container/SALOME_ContainerHelper.py index 82bef6bfa..da46cd6ed 100644 --- a/src/Container/SALOME_ContainerHelper.py +++ b/src/Container/SALOME_ContainerHelper.py @@ -76,7 +76,10 @@ class ScriptExecInfo: @freestyle.setter def freestyle(self, value): - self._freestyle_log.append( value ) + self._freestyle_log = [ value ] + + def preappendFreestyle(self, value): + self._freestyle_log = value + self._freestyle_log @property def measureTimeResolution(self): diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index 8566a0de8..3d30d9a1a 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -558,6 +558,7 @@ Engines::Container_ptr SALOME_ContainerManager::GiveContainer(const Engines::Con cont->set_big_obj_on_disk_threshold( SALOME::GetBigObjOnDiskThreshold() ); cont->set_directory_for_replay_files( SALOME::GetDirectoryForReplayFiles().c_str() ); cont->set_number_of_retry( SALOME::GetNumberOfRetry() ); + cont->set_timeout_before_retrying( SALOME::GetExecutionTimeOut() ); Engines::FieldsDict envCorba; { auto sz = _override_env.size(); diff --git a/src/Container/SALOME_Container_i.hxx b/src/Container/SALOME_Container_i.hxx index ae96b0544..592a61a0c 100644 --- a/src/Container/SALOME_Container_i.hxx +++ b/src/Container/SALOME_Container_i.hxx @@ -88,6 +88,8 @@ public: void set_big_obj_on_disk_directory(const char *directory) override; void set_number_of_retry(CORBA::Long nbRetry) override; + + void set_timeout_before_retrying(CORBA::Long timeInSecond) override; void set_current_directory(const char *cwd) override; diff --git a/src/Container/SALOME_PyNode.py b/src/Container/SALOME_PyNode.py index 9c964354d..062bea530 100644 --- a/src/Container/SALOME_PyNode.py +++ b/src/Container/SALOME_PyNode.py @@ -855,9 +855,11 @@ class SeqByteReceiver: FinalCode = """import pickle from SALOME_PyNode import LogOfCurrentExecutionSession,MY_PERFORMANCE_LOG_ENTRY_IN_GLBS +from KernelBasis import VerbosityActivated import CORBA import Engines import os +from datetime import datetime # WorkDir may be important to replay : "{}" orb = CORBA.ORB_init(['']) caseDirectory = "{}" @@ -867,18 +869,30 @@ outputFileName = os.path.join( caseDirectory, "{}" ) del os outputsKeys = {} exec( "{{}} = LogOfCurrentExecutionSession( orb.string_to_object( \\"{}\\" ) )".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ) +if VerbosityActivated(): + eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("b4loadctx",datetime.now()) ) with open(inputFileName,"rb") as f: context = pickle.load( f ) +if VerbosityActivated(): + eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("afterloadctx",datetime.now()) ) context[MY_PERFORMANCE_LOG_ENTRY_IN_GLBS] = eval( MY_PERFORMANCE_LOG_ENTRY_IN_GLBS ) with open(codeFileName,"r") as f: code = f.read() # go for execution +if VerbosityActivated(): + eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("bforeexec",datetime.now()) ) exec( code , context ) +if VerbosityActivated(): + eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("afterexec",datetime.now()) ) # filter part of context to be exported to father process context = dict( [(k,v) for k,v in context.items() if k in outputsKeys] ) +if VerbosityActivated(): + eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("strtdumpout",datetime.now()) ) # with open(outputFileName,"wb") as f: pickle.dump( context, f ) +if VerbosityActivated(): + eval( "{{}}".format(MY_PERFORMANCE_LOG_ENTRY_IN_GLBS) ).addFreestyleAndFlush( ("afterdump",datetime.now()) ) """ class PythonFunctionEvaluatorParams: @@ -963,6 +977,7 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL import pickle import subprocess as sp import CORBA + import logging # def IsConsideredAsOKRun( returnCode, closeEyesOnErrorAtExit , stderr ): def StdErrTreatment(closeEyesOnErrorAtExit , stderr): @@ -1009,13 +1024,26 @@ sys.stderr.flush()""".format( MY_KEY_TO_DETECT_FINISH ) ) mainExecFileName = os.path.join( dirForReplayFiles, "mainexecsafe_{}.py".format( RetrieveUniquePartFromPfx( codeFileName ) ) ) with open(mainExecFileName,"w") as f: f.write( FinalCode.format( os.getcwd(), dirForReplayFiles, codeFileName, os.path.basename( contextFileName ), os.path.basename( resFileName ), outargsname, iorScriptLog ) ) - for iTry in range( KernelBasis.GetNumberOfRetry() ): + timeOut = KernelBasis.GetExecutionTimeOut() + nbRetry = KernelBasis.GetNumberOfRetry() + logging.debug( "Nb retry = {} Timout in seconds = {}".format( nbRetry, timeOut ) ) + for iTry in range( nbRetry ): if iTry > 0: print( "WARNING : Retry # {}. Following code has generated non zero return code ( {} ). Trying again ... \n{}".format( iTry, returnCode, code ) ) p = sp.Popen(["python3", mainExecFileName],cwd = os.getcwd(),stdout = sp.PIPE, stderr = sp.PIPE) - stdout, stderr = p.communicate() - returnCode = p.returncode + try: + args = {} + if timeOut > 0: + args["timeout"] = timeOut + stdout, stderr = p.communicate( **args ) + except sp.TimeoutExpired as e: + print( "WARNING : during retry #{} timeout set to {} s has failed !".format( iTry, timeOut ) ) + returnCode = 10000000000 + timeOut + else: + returnCode = p.returncode if returnCode == 0: + if iTry >= 1: + logging.warning( "At Retry #{} it's successful :)".format(iTry) ) break return returnCode, stdout, stderr, PythonFunctionEvaluatorParams(mainExecFileName,codeFileNameFull,contextFileName,resFileName) ret = instanceOfLogOfCurrentSession._current_instance @@ -1075,7 +1103,7 @@ class LogOfCurrentExecutionSession(LogOfCurrentExecutionSessionAbs): def addFreestyleAndFlush(self, value): self._current_instance.freestyle = value - self.finalizeAndPushToMaster() + self.finalizeAndPushToMasterAppendFreestyle() def finalizeAndPushToMaster(self): """ @@ -1085,6 +1113,15 @@ class LogOfCurrentExecutionSession(LogOfCurrentExecutionSessionAbs): self._remote_handle.assign( pickle.dumps( self._current_instance ) ) except: pass + + def finalizeAndPushToMasterAppendFreestyle(self): + """ + Voluntary do nothing in case of problem to avoid to trouble execution + """ + try: + self._remote_handle.assignAndAppendFreestyle( pickle.dumps( self._current_instance ) ) + except: + pass class LogOfCurrentExecutionSessionStub(LogOfCurrentExecutionSessionAbs): """ diff --git a/src/Launcher/SALOME_LogManager.cxx b/src/Launcher/SALOME_LogManager.cxx index f006eb245..1a83a2ce5 100644 --- a/src/Launcher/SALOME_LogManager.cxx +++ b/src/Launcher/SALOME_LogManager.cxx @@ -82,6 +82,29 @@ void SALOME_ContainerScriptExecPerfLog::assign(const SALOME::vectorOfByte& value _data = FromPyToCpp(s); } +void SALOME_ContainerScriptExecPerfLog::assignAndAppendFreestyle(const SALOME::vectorOfByte& value) +{ + auto sz = value.length(); + std::vector data( sz ); + for(auto i = 0 ; i < sz ; ++i) + data[i] = value[i]; + // + { + AutoGIL gstate; + //https://docs.python.org/3/c-api/arg.html#c.Py_BuildValue + AutoPyRef dataPyFromRemote(PyBytes_FromStringAndSize(data.data(),data.size())); + AutoPyRef dataPyAlreadyOnSite(PyBytes_FromStringAndSize(_data.data(),_data.size())); + AutoPyRef result(PyObject_CallMethod(pyObj(),(char*)"flushAndAppendFreestyle","OO",dataPyAlreadyOnSite.get(),dataPyFromRemote.get(),nullptr) ) ; + if (PyErr_Occurred()) + { + std::string error("can not assignAndAppendFreestyle"); + PyErr_Print(); + THROW_SALOME_CORBA_EXCEPTION(error.c_str(),SALOME::INTERNAL_ERROR); + } + _data = FromPyToCpp(result); + } +} + SALOME::vectorOfByte *SALOME_ContainerScriptExecPerfLog::getObj() { return FromVectCharToCorba(this->_data); diff --git a/src/Launcher/SALOME_LogManager.hxx b/src/Launcher/SALOME_LogManager.hxx index 33d9f2cf8..d201ed1e0 100644 --- a/src/Launcher/SALOME_LogManager.hxx +++ b/src/Launcher/SALOME_LogManager.hxx @@ -106,6 +106,7 @@ public: PyObject *pyObj() { return _pyExecutionLog.get(); } PortableServer::POA_var getPOA(); void assign(const SALOME::vectorOfByte& value) override; + void assignAndAppendFreestyle(const SALOME::vectorOfByte& value) override; SALOME::vectorOfByte *getObj() override; const std::vector& data() const { return _data; } void setData(std::vector&& data) { _data = std::move(data); } diff --git a/src/Launcher/SALOME_LogManager.py b/src/Launcher/SALOME_LogManager.py index 2af278a4b..58114fc41 100644 --- a/src/Launcher/SALOME_LogManager.py +++ b/src/Launcher/SALOME_LogManager.py @@ -39,6 +39,22 @@ class SALOME_ContainerScriptExecPerfLog: setattr(obj,"tracePosStop",self._stop_pos) setattr(obj,"tracePosStart",self._start_pos) return pickle.dumps(obj) + + def flushAndAppendFreestyle(self,alreadyOnSiteBytes,instanceRemoteBytes): + """ + :param alreadyOnSiteBytes: pickle of instance of ScriptExecInfo of previous value (if any) [bytes] + :param instanceRemoteBytes: pickle of current instance of ScriptExecInfo (if any) [bytes] + """ + alreadyOnSite = None + if len( alreadyOnSiteBytes ) > 0: + alreadyOnSite = pickle.loads(alreadyOnSiteBytes) + instanceRemote = pickle.loads(instanceRemoteBytes) + self._stop_pos = os.path.getsize( self.father.father.logfilename ) + setattr(instanceRemote,"tracePosStop",self._stop_pos) + setattr(instanceRemote,"tracePosStart",self._start_pos) + if alreadyOnSite: + instanceRemote.preappendFreestyle( alreadyOnSite._freestyle_log ) + return pickle.dumps(instanceRemote) def start(self): self._start_pos = os.path.getsize( self.father.father.logfilename ) diff --git a/src/Launcher/Test/testCrashProofContainer.py b/src/Launcher/Test/testCrashProofContainer.py index d05e72461..aad434dbf 100644 --- a/src/Launcher/Test/testCrashProofContainer.py +++ b/src/Launcher/Test/testCrashProofContainer.py @@ -34,6 +34,7 @@ import tempfile import logging from datetime import datetime import subprocess as sp +from pathlib import Path killMeCode = """ import os @@ -83,6 +84,15 @@ k = os.getcwd() print("OKKKKKK3333") """ +FunnyCase = """import os +import time +if not os.path.exists( {!r} ): + time.sleep( 20 ) # first exec +else: + time.sleep( 2 ) # second exec after retry +j = 44 +""" + class testPerfLogManager1(unittest.TestCase): def test0(self): """ @@ -111,7 +121,8 @@ class testPerfLogManager1(unittest.TestCase): ret = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() ) self.assertEqual(ret,24) # container has received a SIGKILL but it kindly continue to respond :) a = salome.logm.NaiveFetch() - self.assertEqual(a[0][2][0].get().freestyle,[('a',777)]) + dicToTest = {k:v for k,v in a[0][2][0].get().freestyle} + self.assertEqual(dicToTest['a'],777) cont.Shutdown() def test1(self): @@ -141,7 +152,10 @@ class testPerfLogManager1(unittest.TestCase): ret = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() ) self.assertEqual(ret,24) # container has received a SIGKILL but it kindly continue to respond :) a = salome.logm.NaiveFetch() - self.assertEqual(a[0][2][0].get().freestyle,[('a',777)]) + dicToTest = {k:v for k,v in a[0][2][0].get().freestyle} + self.assertEqual(dicToTest['a'],777) + # EDF30875 : check in verbose mode of presence of entries to debug + self.assertEqual( set(dicToTest), {'strtdumpout', 'b4loadctx', 'afterloadctx', 'afterdump', 'bforeexec', 'a', 'afterexec'} ) grpsOfLogToKill = cont.getAllLogFileNameGroups() self.assertEqual(1,len(grpsOfLogToKill)) replayInput = grpsOfLogToKill[0] @@ -248,6 +262,45 @@ class testPerfLogManager1(unittest.TestCase): self.assertEqual( ret2, str(tmpdirname) ) cont.Shutdown() + def test5(self): + """ + EDF30875 : Verbose mode set to ON + """ + from threading import Thread + def func( fname ): + import time + time.sleep( 5 ) + with open( fname, "w" ) as f: + f.write( "go" ) + # file used to pilot the behaviour of process + fname = "touch.txt" + KernelBasis.SetNumberOfRetry(2) + KernelBasis.SetExecutionTimeOut(10) # <= Key Point is here + KernelBasis.SetPyExecutionMode("OutOfProcessNoReplayFT") # Fail tolerant + with tempfile.TemporaryDirectory() as tmpdirname: + os.chdir( tmpdirname ) + fnameFull = Path(tmpdirname) / fname + hostname = "localhost" + cp = pylauncher.GetRequestForGiveContainer(hostname,"container_crash_test") + salome.cm.SetDirectoryForReplayFiles( str( tmpdirname ) ) + with salome.ContainerLauncherCM(cp,True) as cont: + poa = salome.orb.resolve_initial_references("RootPOA") + obj = SALOME_PyNode.SenderByte_i(poa,pickle.dumps( (["i"],{"i": 3} ) )) ; id_o = poa.activate_object(obj) ; refPtr = poa.id_to_reference(id_o) + pyscript = cont.createPyScriptNode("testScript5",FunnyCase.format( fnameFull.as_posix() )) + t = Thread(target = func,args=[fnameFull.as_posix()]) + t.start() + pyscript.executeFirst(refPtr) + ret = pyscript.executeSecond(["j"]) + t.join() + ret0 = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() ) + self.assertEqual(ret0,44) + a = salome.logm.NaiveFetch() + #res = a[0][1][0] + #b = SALOME_ContainerHelper.ScriptExecInfoDeco(res,a[0][1]) print( a[0].log) + # print( res.get().freestyle ) + #print( [elt[0] for elt in res.get().freestyle] ) + #self.assertEqual( [elt[0] for elt in res.get().freestyle], ['b4loadctx', 'afterloadctx', 'bforeexec', 'b4loadctx', 'afterloadctx', 'bforeexec', 'afterexec', 'strtdumpout', 'afterdump'] ) + if __name__ == '__main__': from salome_utils import positionVerbosityOfLoggerRegardingState,setVerboseLevel,setVerbose salome.standalone() -- 2.39.2