From: Anthony Geay Date: Wed, 5 Jun 2024 07:48:18 +0000 (+0200) Subject: [EDF29150] : Put additionnal fault tolerant mecanism X-Git-Url: http://git.salome-platform.org/gitweb/?a=commitdiff_plain;h=a0add1125944fbde84c59ba2263686a26b99c493;p=modules%2Fkernel.git [EDF29150] : Put additionnal fault tolerant mecanism --- diff --git a/idl/SALOME_Component.idl b/idl/SALOME_Component.idl index 04ac5b340..2d7d3ea5b 100644 --- a/idl/SALOME_Component.idl +++ b/idl/SALOME_Component.idl @@ -89,6 +89,8 @@ module Engines void set_big_obj_on_disk_threshold(in long thresholdInByte); void set_big_obj_on_disk_directory(in string directory); + + void set_number_of_retry(in long nbRetry); void addLogFileNameGroup(in vectorOfString groupOfLogFileNames); diff --git a/idl/SALOME_ContainerManager.idl b/idl/SALOME_ContainerManager.idl index 32d9c6676..110c5b419 100644 --- a/idl/SALOME_ContainerManager.idl +++ b/idl/SALOME_ContainerManager.idl @@ -112,6 +112,10 @@ interface ContainerManager void SetBigObjOnDiskDirectory(in string directory); + void SetNumberOfRetry(in long nbRetry); + + long GetNumberOfRetry(); + void SetCodeOnContainerStartUp(in string code); string GetCodeOnContainerStartUp(); diff --git a/src/Basics/KernelBasis.cxx b/src/Basics/KernelBasis.cxx index 5789721ad..c236f0020 100644 --- a/src/Basics/KernelBasis.cxx +++ b/src/Basics/KernelBasis.cxx @@ -155,6 +155,10 @@ void SALOME::SetBigObjOnDiskThreshold(int newThresholdInByte) static std::string SALOME_FILE_BIG_OBJ_DIR; +constexpr int DFT_SALOME_NB_RETRY = 1; + +static int SALOME_NB_RETRY = DFT_SALOME_NB_RETRY; + std::string SALOME::GetBigObjOnDiskDirectory() { return SALOME_FILE_BIG_OBJ_DIR; @@ -170,6 +174,16 @@ bool SALOME::BigObjOnDiskDirectoryDefined() return ! SALOME_FILE_BIG_OBJ_DIR.empty(); } +void SALOME::SetNumberOfRetry(int nbRetry) +{ + SALOME_NB_RETRY = nbRetry; +} + +int SALOME::GetNumberOfRetry() +{ + return SALOME_NB_RETRY; +} + static SALOME::PyExecutionMode DefaultPyExecMode = SALOME::PyExecutionMode::NotSet; void SALOME::SetPyExecutionMode(PyExecutionMode mode) diff --git a/src/Basics/KernelBasis.hxx b/src/Basics/KernelBasis.hxx index 8399fc76f..d2a7bb222 100644 --- a/src/Basics/KernelBasis.hxx +++ b/src/Basics/KernelBasis.hxx @@ -48,4 +48,6 @@ namespace SALOME std::string BASICS_EXPORT GetBigObjOnDiskDirectory(); void BASICS_EXPORT SetBigObjOnDiskDirectory(const std::string& directory); bool BASICS_EXPORT BigObjOnDiskDirectoryDefined(); + void BASICS_EXPORT SetNumberOfRetry(int nbRetry); + int BASICS_EXPORT GetNumberOfRetry(); } diff --git a/src/Basics/KernelBasis.i b/src/Basics/KernelBasis.i index 092c7537f..6c5e8536f 100644 --- a/src/Basics/KernelBasis.i +++ b/src/Basics/KernelBasis.i @@ -56,6 +56,8 @@ using namespace SALOME; %rename (GetBigObjOnDiskDirectory) GetBigObjOnDiskDirectorySwig; %rename (SetBigObjOnDiskDirectory) SetBigObjOnDiskDirectorySwig; %rename (BigObjOnDiskDirectoryDefined) BigObjOnDiskDirectoryDefinedSwig; +%rename (SetNumberOfRetry) SetNumberOfRetrySwig; +%rename (GetNumberOfRetry) GetNumberOfRetrySwig; bool getSSLMode(); void setSSLMode(bool sslMode); @@ -142,6 +144,16 @@ bool BigObjOnDiskDirectoryDefinedSwig() return SALOME::BigObjOnDiskDirectoryDefined(); } +void SetNumberOfRetrySwig(int nbRetry) +{ + SALOME::SetNumberOfRetry( nbRetry ); +} + +int GetNumberOfRetrySwig() +{ + return SALOME::GetNumberOfRetry( ); +} + void SetVerbosityLevelSwig(const std::string& level) { SetVerbosityLevelStr(level); diff --git a/src/Container/Container_i.cxx b/src/Container/Container_i.cxx index a12edd4ff..4e0d5fe93 100644 --- a/src/Container/Container_i.cxx +++ b/src/Container/Container_i.cxx @@ -1180,6 +1180,11 @@ void Abstract_Engines_Container_i::set_big_obj_on_disk_directory(const char *dir SALOME::SetBigObjOnDiskDirectory(directory); } +void Abstract_Engines_Container_i::set_number_of_retry(CORBA::Long nbRetry) +{ + SALOME::SetNumberOfRetry( nbRetry ); +} + Engines::vectorOfString_var FromVecStringCppToCORBA( const std::vector& group) { Engines::vectorOfString_var ret( new Engines::vectorOfString ); diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index 391c4076d..2ee5639bc 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -246,6 +246,16 @@ void SALOME_ContainerManager::SetBigObjOnDiskDirectory(const char *directory) SALOME::SetBigObjOnDiskDirectory(directory); } + void SALOME_ContainerManager::SetNumberOfRetry(CORBA::Long nbRetry) + { + SALOME::SetNumberOfRetry( nbRetry ); + } + +CORBA::Long SALOME_ContainerManager::GetNumberOfRetry() +{ + return SALOME::GetNumberOfRetry(); +} + //============================================================================= //! Loop on all the containers listed in naming service, ask shutdown on each /*! CORBA Method: @@ -541,6 +551,7 @@ Engines::Container_ptr SALOME_ContainerManager::GiveContainer(const Engines::Con INFOS("[GiveContainer] container " << containerNameInNS << " override " << envInfo.str()); cont->set_big_obj_on_disk_directory( SALOME::GetBigObjOnDiskDirectory().c_str() ); cont->set_big_obj_on_disk_threshold( SALOME::GetBigObjOnDiskThreshold() ); + cont->set_number_of_retry( SALOME::GetNumberOfRetry() ); Engines::FieldsDict envCorba; { auto sz = _override_env.size(); diff --git a/src/Container/SALOME_ContainerManager.hxx b/src/Container/SALOME_ContainerManager.hxx index 6119ec393..a02bd9258 100644 --- a/src/Container/SALOME_ContainerManager.hxx +++ b/src/Container/SALOME_ContainerManager.hxx @@ -83,6 +83,10 @@ public: void SetBigObjOnDiskDirectory(const char *directory) override; + void SetNumberOfRetry(CORBA::Long nbRetry) override; + + CORBA::Long GetNumberOfRetry() override; + static const char *_ContainerManagerNameInNS; private: diff --git a/src/Container/SALOME_Container_i.hxx b/src/Container/SALOME_Container_i.hxx index 4ade76e5b..08c174254 100644 --- a/src/Container/SALOME_Container_i.hxx +++ b/src/Container/SALOME_Container_i.hxx @@ -86,6 +86,8 @@ public: void set_big_obj_on_disk_directory(const char *directory) override; + void set_number_of_retry(CORBA::Long nbRetry) override; + void addLogFileNameGroup(const Engines::vectorOfString& groupOfLogFileNames) override; Engines::vectorOfVectorOfString *getAllLogFileNameGroups() override; diff --git a/src/Container/SALOME_PyNode.py b/src/Container/SALOME_PyNode.py index 0069e15c7..0b0c9b3cd 100644 --- a/src/Container/SALOME_PyNode.py +++ b/src/Container/SALOME_PyNode.py @@ -562,6 +562,9 @@ class GenericPythonMonitoringLauncherCtxMgr: def __exit__(self,exctype, exc, tb): StopMonitoring( self._monitoring_params ) + del self._monitoring_params + import gc + gc.collect() # force destruction of objects even in raise context def StopMonitoring( monitoringInfo ): """ @@ -732,6 +735,9 @@ with open(inputFileName,"rb") as f: context[MY_PERFORMANCE_LOG_ENTRY_IN_GLBS] = eval( MY_PERFORMANCE_LOG_ENTRY_IN_GLBS ) with open(codeFileName,"r") as f: code = f.read() +# +import gc +gc.disable() # go for execution exec( code , context ) # filter part of context to be exported to father process @@ -838,6 +844,7 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL # def InternalExecResistant( code, context, outargsname): + import KernelBasis orb = CORBA.ORB_init(['']) iorScriptLog = orb.object_to_string( instanceOfLogOfCurrentSession._remote_handle )#ref ContainerScriptPerfLog_ptr #### @@ -860,9 +867,14 @@ sys.stderr.flush()""".format( MY_KEY_TO_DETECT_FINISH ) ) mainExecFileName = os.path.abspath( "mainexecsafe_{}.py".format( RetrieveUniquePartFromPfx( codeFileName ) ) ) with open(mainExecFileName,"w") as f: f.write( FinalCode.format( codeFileName, contextFileName, resFileName, outargsname, iorScriptLog ) ) - p = sp.Popen(["python3", mainExecFileName],stdout = sp.PIPE, stderr = sp.PIPE) - stdout, stderr = p.communicate() - returnCode = p.returncode + for iTry in range( KernelBasis.GetNumberOfRetry() ): + if iTry > 0: + print( "WARNING : Retry # {}. Following code has generated non zero return code ( {} ). Trying again ... \n{}".format( iTry, returnCode, code ) ) + p = sp.Popen(["python3", mainExecFileName],stdout = sp.PIPE, stderr = sp.PIPE) + stdout, stderr = p.communicate() + returnCode = p.returncode + if returnCode == 0: + break return returnCode, stdout, stderr, PythonFunctionEvaluatorParams(mainExecFileName,codeFileName,contextFileName,resFileName) ret = instanceOfLogOfCurrentSession._current_instance returnCode, stdout, stderr, evParams = InternalExecResistant( code, context, outargsname ) @@ -924,7 +936,13 @@ class LogOfCurrentExecutionSession(LogOfCurrentExecutionSessionAbs): self.finalizeAndPushToMaster() def finalizeAndPushToMaster(self): - self._remote_handle.assign( pickle.dumps( self._current_instance ) ) + """ + Voluntary do nothing in case of problem to avoid to trouble execution + """ + try: + self._remote_handle.assign( pickle.dumps( self._current_instance ) ) + except: + pass class LogOfCurrentExecutionSessionStub(LogOfCurrentExecutionSessionAbs): """ @@ -1049,17 +1067,20 @@ class PyScriptNode_Abstract_i(Engines__POA.PyScriptNode,Generic,abc.ABC): def executeSecond(self,outargsname): """ Same than second part of self.execute to reduce memory peak.""" + def executeSecondInternal(monitoringtimeresms): + with GenericPythonMonitoringLauncherCtxMgr( CPUMemoryMonitoring( monitoringtimeresms ) ) as monitoringParams: + currentInstance = self.executeNow( outargsname ) + cpumeminfo = ReadCPUMemInfo( monitoringParams ) + return cpumeminfo, currentInstance + import sys try: self.addTimeInfoOnLevel2("startExecTime") ## self.addInfoOnLevel2("measureTimeResolution",self.my_container_py.monitoringtimeresms()) - with GenericPythonMonitoringLauncherCtxMgr( CPUMemoryMonitoring( self.my_container_py.monitoringtimeresms() ) ) as monitoringParams: - self._current_execution_session._current_instance = self.executeNow( outargsname ) - cpumeminfo = ReadCPUMemInfo( monitoringParams ) + cpumeminfo, self._current_execution_session._current_instance = executeSecondInternal( self.my_container_py.monitoringtimeresms() ) ## self.addInfoOnLevel2("CPUMemDuringExec",cpumeminfo) - del monitoringParams self.addTimeInfoOnLevel2("endExecTime") self.addTimeInfoOnLevel2("startOutputTime") argsout=[] diff --git a/src/Launcher/Test/testCrashProofContainer.py b/src/Launcher/Test/testCrashProofContainer.py index 6b0ca88a8..0d2a168fe 100644 --- a/src/Launcher/Test/testCrashProofContainer.py +++ b/src/Launcher/Test/testCrashProofContainer.py @@ -37,6 +37,7 @@ import subprocess as sp killMeCode = """ import os import sys +import signal j = 7 * i sys.stdout.write(str(j)) ; sys.stdout.flush() # the aime of test in replay mode to be sure that case is runnable os.kill( os.getpid() , signal.SIGKILL)# the aim of test is here @@ -127,7 +128,7 @@ class testPerfLogManager1(unittest.TestCase): # now try to replay the failing case p = sp.Popen(["python3",os.path.basename(replayInput[0])],cwd = os.path.dirname(replayInput[0]),stdout=sp.PIPE,stderr=sp.PIPE) out,err = p.communicate() - self.assertEqual(1,p.returncode) # very important ! The failing case must continue to fail :) + self.assertNotEqual(p.returncode,0) # very important ! The failing case must continue to fail :) self.assertEqual("21".encode(),out) # very important to check that the reported case is standalone enough to be replayable poste mortem # cleanup dn = os.path.dirname(replayInput[0]) @@ -177,6 +178,7 @@ class testPerfLogManager1(unittest.TestCase): KernelBasis.SetPyExecutionMode("OutOfProcessWithReplayFT") hostname = "localhost" cp = pylauncher.GetRequestForGiveContainer(hostname,"container_crash_test") + salome.cm.SetNumberOfRetry( 3 ) salome.cm.SetBigObjOnDiskThreshold(1000) salome.cm.SetOverrideEnvForContainersSimple(env = []) cont = salome.cm.GiveContainer(cp) @@ -188,7 +190,9 @@ class testPerfLogManager1(unittest.TestCase): ret = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() ) self.assertEqual(ret,27) with open(cont.locallogfilename) as f: - self.assertTrue( "WARNING : Following code has generated non zero return code" in f.read() )# should report something into the container + logCont = f.read( ) + self.assertTrue( "WARNING : Retry #" in logCont) + self.assertTrue( "WARNING : Following code has generated non zero return code" in logCont )# should report something into the container cont.Shutdown() if __name__ == '__main__':