From: Anthony Geay Date: Thu, 28 Mar 2024 11:44:52 +0000 (+0100) Subject: [EDF29852] : test of mecanism of replay on error X-Git-Url: http://git.salome-platform.org/gitweb/?a=commitdiff_plain;h=98cc28403d9d665489d8b393229bdeeebf337a09;p=modules%2Fkernel.git [EDF29852] : test of mecanism of replay on error --- diff --git a/idl/SALOME_Component.idl b/idl/SALOME_Component.idl index b3585af45..f31438844 100644 --- a/idl/SALOME_Component.idl +++ b/idl/SALOME_Component.idl @@ -65,6 +65,7 @@ module Engines typedef sequence FieldsDict; typedef sequence vectorOfDouble; typedef sequence vectorOfString; + typedef sequence vectorOfVectorOfString; interface EngineComponent ; interface fileRef ; @@ -85,6 +86,10 @@ module Engines FieldsDict get_os_environment(); + void addLogFileNameGroup(in vectorOfString groupOfLogFileNames); + + vectorOfVectorOfString getAllLogFileNameGroups(); + void execute_python_code( in string code ) raises(SALOME::SALOME_Exception); /*! \brief Loads a new component class (dynamic library). 
diff --git a/src/Container/Container_i.cxx b/src/Container/Container_i.cxx index 995b62444..aee98c06c 100644 --- a/src/Container/Container_i.cxx +++ b/src/Container/Container_i.cxx @@ -1169,6 +1169,46 @@ Engines::FieldsDict *Abstract_Engines_Container_i::get_os_environment() return ret.release(); } +Engines::vectorOfString_var FromVecStringCppToCORBA( const std::vector& group) +{ + Engines::vectorOfString_var ret( new Engines::vectorOfString ); + auto sz( group.size() ); + ret->length( sz ); + for(auto i = 0 ; i < sz ; ++i) + { + ret[i] = CORBA::string_dup( group[i].c_str() ); + } + return ret; +} + +std::vector FromCORBAVecStringToCpp(const Engines::vectorOfString& groupOfLogFileNames) +{ + auto len = groupOfLogFileNames.length(); + std::vector ret( len ); + for( auto i = 0 ; i < len ; ++i ) + { + ret[i] = groupOfLogFileNames[i]; + } + return ret; +} + +void Abstract_Engines_Container_i::addLogFileNameGroup(const Engines::vectorOfString& groupOfLogFileNames) +{ + this->_groups_of_log_files.push_back( FromCORBAVecStringToCpp(groupOfLogFileNames) ); +} + +Engines::vectorOfVectorOfString *Abstract_Engines_Container_i::getAllLogFileNameGroups() +{ + std::unique_ptr ret( new Engines::vectorOfVectorOfString ); + auto nbOfGrps = this->_groups_of_log_files.size(); + ret->length( nbOfGrps ); + for(auto i = 0 ; i < nbOfGrps ; ++i) + { + (*ret)[i] = FromVecStringCppToCORBA( _groups_of_log_files[i] ); + } + return ret.release(); +} + void Abstract_Engines_Container_i::execute_python_code(const char *code) { AutoGIL gstate; diff --git a/src/Container/SALOME_Container_i.hxx b/src/Container/SALOME_Container_i.hxx index aa7fd5a83..81643dc17 100644 --- a/src/Container/SALOME_Container_i.hxx +++ b/src/Container/SALOME_Container_i.hxx @@ -47,6 +47,7 @@ #include #include #include +#include class SALOME_NamingService_Container_Abstract; @@ -80,6 +81,10 @@ public: void override_environment( const Engines::FieldsDict& env ) override; Engines::FieldsDict *get_os_environment() override; + 
+ void addLogFileNameGroup(const Engines::vectorOfString& groupOfLogFileNames) override; + + Engines::vectorOfVectorOfString *getAllLogFileNameGroups() override; void execute_python_code(const char *code) override; @@ -203,6 +208,7 @@ protected: Utils_Mutex _mutexForDftPy; std::list _tmp_files; Engines::fileTransfer_var _fileTransfer; + std::vector< std::vector > _groups_of_log_files; int _argc; char **_argv; diff --git a/src/Container/SALOME_PyNode.py b/src/Container/SALOME_PyNode.py index 02cfb9188..26493bf81 100644 --- a/src/Container/SALOME_PyNode.py +++ b/src/Container/SALOME_PyNode.py @@ -754,10 +754,15 @@ class PythonFunctionEvaluatorParams: for fileToDestroy in [self._main_filename,self._code_filename,self._in_context_filename,self._out_context_filename]: if os.path.exists( fileToDestroy ): os.unlink( fileToDestroy ) - def destroyOnKO(self): + def destroyOnKO(self, containerRef): + """ + Called in the context of failure with replay mode activated + """ for fileToDestroy in [self._out_context_filename]: if os.path.exists( fileToDestroy ): os.unlink( fileToDestroy ) + # register to the container the group of files (main script, code, input context) kept to replay the failing case + containerRef.addLogFileNameGroup([self._main_filename,self._code_filename,self._in_context_filename]) @property def replayCmd(self): return "To replay : ( cd {} && python3 {} )".format(os.path.dirname(self._main_filename),os.path.basename(self._main_filename)) @@ -787,7 +792,7 @@ Looks like a hard crash as returnCode {returnCode} != 0 {banner} """ -def ExecCrashProofGeneric( code, context, outargsname, instanceOfLogOfCurrentSession, keepFilesToReplay ): +def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, keepFilesToReplay ): """ Equivalent of exec(code,context) but executed in a separate subprocess to avoid to make the current process crash. 
@@ -796,6 +801,8 @@ def ExecCrashProofGeneric( code, context, outargsname, instanceOfLogOfCurrentSes code (str) : python code to be executed using context context (dict) : context to be used for execution. This context will be updated in accordance with the execution of code. + outargsname (list) : list of arguments to be exported + containerRef (Engines.Container) : Container ref (used to register the files kept for replay when execution fails and keepFilesToReplay is set to True) instanceOfLogOfCurrentSession (LogOfCurrentExecutionSession) : instance of LogOfCurrentExecutionSession to build remotely the reference in order to log information keepFilesToReplay (bool) : if True when something goes wrong during execution all the files to replay post mortem case are kept. If False only error is reported but files to replay are destoyed. @@ -804,6 +811,10 @@ def ExecCrashProofGeneric( code, context, outargsname, instanceOfLogOfCurrentSes ScriptExecInfo : instance serverside + In/Out: + ------- + + context will be modified by this method : the elements listed in outargsname will be added to it, with their corresponding values coming from the evaluation. 
""" import tempfile import pickle @@ -847,18 +858,18 @@ def ExecCrashProofGeneric( code, context, outargsname, instanceOfLogOfCurrentSes return ret if returnCode != 0: if keepFilesToReplay: - evParams.destroyOnKO() + evParams.destroyOnKO( containerRef ) else: evParams.destroyOnOK() raise RuntimeError(f"Subprocess launched {evParams.strDependingOnReturnCode(keepFilesToReplay,returnCode)}stdout :\n{stdout}\nstderr :\n{stderr}") -def ExecCrashProofWithReplay( code, context, outargsname, instanceOfLogOfCurrentSession ): - return ExecCrashProofGeneric(code, context, outargsname, instanceOfLogOfCurrentSession, True) +def ExecCrashProofWithReplay( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ): + return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, True) -def ExecCrashProofWithoutReplay( code, context, outargsname, instanceOfLogOfCurrentSession ): - return ExecCrashProofGeneric(code, context, outargsname, instanceOfLogOfCurrentSession, False) +def ExecCrashProofWithoutReplay( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ): + return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, False) -def ExecLocal( code, context, outargsname, instanceOfLogOfCurrentSession ): +def ExecLocal( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ): exec( code, context ) return instanceOfLogOfCurrentSession._current_instance @@ -1087,18 +1098,18 @@ class PyScriptNode_i(PyScriptNode_Abstract_i): super().__init__(nodeName, code, poa, my_container, logscript) def executeNow(self, outargsname): - return ExecLocal(self.ccode,self.context,outargsname,self._current_execution_session) + return ExecLocal(self.ccode,self.context,outargsname,self.my_container,self._current_execution_session) class PyScriptNode_OutOfProcess_i(PyScriptNode_Abstract_i): def __init__(self, nodeName, code, poa, my_container, logscript): 
super().__init__(nodeName, code, poa, my_container, logscript) def executeNow(self, outargsname): - return ExecCrashProofWithoutReplay(self.code,self.context,outargsname,self._current_execution_session) + return ExecCrashProofWithoutReplay(self.code,self.context,outargsname,self.my_container,self._current_execution_session) class PyScriptNode_OutOfProcess_Replay_i(PyScriptNode_Abstract_i): def __init__(self, nodeName, code, poa, my_container, logscript): super().__init__(nodeName, code, poa, my_container, logscript) def executeNow(self, outargsname): - return ExecCrashProofWithReplay(self.code,self.context,outargsname,self._current_execution_session) + return ExecCrashProofWithReplay(self.code,self.context,outargsname,self.my_container,self._current_execution_session) diff --git a/src/Launcher/Test/testCrashProofContainer.py b/src/Launcher/Test/testCrashProofContainer.py index cb1720445..027cf7904 100644 --- a/src/Launcher/Test/testCrashProofContainer.py +++ b/src/Launcher/Test/testCrashProofContainer.py @@ -32,11 +32,13 @@ import pickle import tempfile import logging from datetime import datetime - +import subprocess as sp killMeCode = """ import os +import sys j = 7 * i +sys.stdout.write(str(j)) ; sys.stdout.flush() # the aime of test in replay mode to be sure that case is runnable os.kill( os.getpid() , signal.SIGKILL)# the aim of test is here """ @@ -46,7 +48,7 @@ my_log_4_this_session.addFreestyleAndFlush( ("a",777) ) # to check that hidden v """ class testPerfLogManager1(unittest.TestCase): - def tess0(self): + def test0(self): """ EDF29852 : Kill container with OutOfProcessNoReplay mode and see if container still responds. 
""" @@ -98,6 +100,20 @@ class testPerfLogManager1(unittest.TestCase): self.assertEqual(ret,24) # container has received a SIGKILL but it kindly continue to respond :) a = salome.logm.NaiveFetch() self.assertEqual(a[0][2][0].get().freestyle,[('a',777)]) + grpsOfLogToKill = cont.getAllLogFileNameGroups() + self.assertEqual(1,len(grpsOfLogToKill)) + replayInput = grpsOfLogToKill[0] + # now try to replay the failing case + p = sp.Popen(["python3",os.path.basename(replayInput[0])],cwd = os.path.dirname(replayInput[0]),stdout=sp.PIPE,stderr=sp.PIPE) + out,err = p.communicate() + self.assertEqual(1,p.returncode) # very important ! The failing case must continue to fail :) + self.assertEqual("21".encode(),out) # very important to check that the reported case is standalone enough to be replayable poste mortem + # cleanup + dn = os.path.dirname(replayInput[0]) + for elt in replayInput: + zeFile = os.path.join( dn, os.path.basename(elt) ) + if os.path.exists( zeFile ): + os.unlink( zeFile ) cont.Shutdown() if __name__ == '__main__':