Salome HOME
[EDF29852] : test of the mechanism of replay on error
authorAnthony Geay <anthony.geay@edf.fr>
Thu, 28 Mar 2024 11:44:52 +0000 (12:44 +0100)
committerAnthony Geay <anthony.geay@edf.fr>
Thu, 28 Mar 2024 11:44:52 +0000 (12:44 +0100)
idl/SALOME_Component.idl
src/Container/Container_i.cxx
src/Container/SALOME_Container_i.hxx
src/Container/SALOME_PyNode.py
src/Launcher/Test/testCrashProofContainer.py

index b3585af45a067c565b49780945ba00d207ca7235..f3143884467cb95447eedd64fbbbd75c4f5f2723 100644 (file)
@@ -65,6 +65,7 @@ module Engines
   typedef sequence<KeyValuePair> FieldsDict;
   typedef sequence<double> vectorOfDouble;
   typedef sequence<string> vectorOfString;
+  typedef sequence<vectorOfString> vectorOfVectorOfString;
 
   interface EngineComponent ;
   interface fileRef ;
@@ -85,6 +86,10 @@ module Engines
 
     FieldsDict get_os_environment();
 
+    void addLogFileNameGroup(in vectorOfString groupOfLogFileNames);
+    
+    vectorOfVectorOfString getAllLogFileNameGroups();
+
     void execute_python_code( in string code ) raises(SALOME::SALOME_Exception);
 
     /*! \brief Loads a new component class (dynamic library).
index 995b624442d28fac533666fe1eb1b24cafd55345..aee98c06c9f88c40f5455002f82623f8e8141d12 100644 (file)
@@ -1169,6 +1169,46 @@ Engines::FieldsDict *Abstract_Engines_Container_i::get_os_environment()
   return ret.release();
 }
 
+/*!
+ * Converts a C++ vector of strings into a newly allocated CORBA sequence of strings.
+ *
+ * \param [in] group strings to copy, in order, into the returned sequence
+ * \return sequence of the same length as \a group, each entry being a CORBA::string_dup copy
+ */
+Engines::vectorOfString_var FromVecStringCppToCORBA( const std::vector<std::string>& group)
+{
+  Engines::vectorOfString_var ret( new Engines::vectorOfString );
+  // Same unsigned type for the length and the index : avoids comparing a signed int with an unsigned size
+  const CORBA::ULong sz( static_cast<CORBA::ULong>( group.size() ) );
+  ret->length( sz );
+  for(CORBA::ULong i = 0 ; i < sz ; ++i)
+  {
+    ret[i] = CORBA::string_dup( group[i].c_str() );
+  }
+  return ret;
+}
+
+/*!
+ * Converts a CORBA sequence of strings into a C++ vector of strings.
+ *
+ * \param [in] groupOfLogFileNames sequence whose entries are copied in order
+ * \return vector of the same length as \a groupOfLogFileNames holding a copy of each entry
+ */
+std::vector<std::string> FromCORBAVecStringToCpp(const Engines::vectorOfString& groupOfLogFileNames)
+{
+  // Same unsigned type for the length and the index : avoids comparing a signed int with an unsigned length
+  const CORBA::ULong len( groupOfLogFileNames.length() );
+  std::vector<std::string> ret( len );
+  for( CORBA::ULong i = 0 ; i < len ; ++i )
+  {
+    ret[i] = groupOfLogFileNames[i];
+  }
+  return ret;
+}
+
+/*!
+ * Appends a new group of log file names to the groups stored by this container.
+ *
+ * \param [in] groupOfLogFileNames file names forming the group ; they are copied into the container
+ */
+void Abstract_Engines_Container_i::addLogFileNameGroup(const Engines::vectorOfString& groupOfLogFileNames)
+{
+  // The converted vector is a temporary : it is moved into the stored list of groups
+  _groups_of_log_files.emplace_back( FromCORBAVecStringToCpp(groupOfLogFileNames) );
+}
+    
+/*!
+ * Returns every group of log file names registered so far with addLogFileNameGroup.
+ *
+ * \return newly allocated sequence with one entry per stored group, in registration order.
+ *         Ownership of the returned pointer is released to the caller.
+ */
+Engines::vectorOfVectorOfString *Abstract_Engines_Container_i::getAllLogFileNameGroups()
+{
+  std::unique_ptr<Engines::vectorOfVectorOfString> ret( new Engines::vectorOfVectorOfString );
+  // Same unsigned type for the length and the index : avoids comparing a signed int with an unsigned size
+  const CORBA::ULong nbOfGrps( static_cast<CORBA::ULong>( this->_groups_of_log_files.size() ) );
+  ret->length( nbOfGrps );
+  for(CORBA::ULong i = 0 ; i < nbOfGrps ; ++i)
+  {
+    (*ret)[i] = FromVecStringCppToCORBA( _groups_of_log_files[i] );
+  }
+  return ret.release();
+}
+
 void Abstract_Engines_Container_i::execute_python_code(const char *code)
 {
   AutoGIL gstate;
index aa7fd5a83e8508168e833488e371b834b4e64f8a..81643dc17cbf12349b3d5980181d20902fd497ba 100644 (file)
@@ -47,6 +47,7 @@
 #include <map>
 #include <list>
 #include <string>
+#include <vector>
 
 class SALOME_NamingService_Container_Abstract;
 
@@ -80,6 +81,10 @@ public:
   void override_environment( const Engines::FieldsDict& env ) override;
 
   Engines::FieldsDict *get_os_environment() override;
+
+  void addLogFileNameGroup(const Engines::vectorOfString& groupOfLogFileNames) override;
+    
+  Engines::vectorOfVectorOfString *getAllLogFileNameGroups() override;
   
   void execute_python_code(const char *code) override;
 
@@ -203,6 +208,7 @@ protected:
   Utils_Mutex _mutexForDftPy;
   std::list<std::string> _tmp_files;
   Engines::fileTransfer_var _fileTransfer;
+  std::vector< std::vector<std::string> > _groups_of_log_files;
 
   int _argc;
   char **_argv;
index 02cfb9188695d6bb75e76ea9f340c728734dd005..26493bf8145eccf234e0c7d1f425e49be5bb728b 100644 (file)
@@ -754,10 +754,15 @@ class PythonFunctionEvaluatorParams:
     for fileToDestroy in [self._main_filename,self._code_filename,self._in_context_filename,self._out_context_filename]:
       if os.path.exists( fileToDestroy ):
         os.unlink( fileToDestroy )
-  def destroyOnKO(self):
+  def destroyOnKO(self, containerRef):
+     """
+     Called in the context of failure with replay mode activated
+     """
      for fileToDestroy in [self._out_context_filename]:
       if os.path.exists( fileToDestroy ):
         os.unlink( fileToDestroy )
+      # register with the container the group of files needed to replay the failing case
+      containerRef.addLogFileNameGroup([self._main_filename,self._code_filename,self._in_context_filename])
   @property
   def replayCmd(self):
     return "To replay : ( cd {} && python3 {} )".format(os.path.dirname(self._main_filename),os.path.basename(self._main_filename))
@@ -787,7 +792,7 @@ Looks like a hard crash as returnCode {returnCode} != 0
 {banner}
 """
 
-def ExecCrashProofGeneric( code, context, outargsname, instanceOfLogOfCurrentSession, keepFilesToReplay ):
+def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, keepFilesToReplay ):
   """
   Equivalent of exec(code,context) but executed in a separate subprocess to avoid to make the current process crash.
   
@@ -796,6 +801,8 @@ def ExecCrashProofGeneric( code, context, outargsname, instanceOfLogOfCurrentSes
 
   code (str) : python code to be executed using context
   context (dict) : context to be used for execution. This context will be updated in accordance with the execution of code.
+  outargsname (list<str>) : list of arguments to be exported 
+  containerRef (Engines.Container) : Container ref (used to register the files kept for replay when keepFilesToReplay is set to True)
   instanceOfLogOfCurrentSession (LogOfCurrentExecutionSession) : instance of LogOfCurrentExecutionSession to build remotely the reference in order to log information
   keepFilesToReplay (bool) : if True, when something goes wrong during execution, all the files needed to replay the case post mortem are kept. If False, only the error is reported and the files to replay are destroyed.
 
@@ -804,6 +811,10 @@ def ExecCrashProofGeneric( code, context, outargsname, instanceOfLogOfCurrentSes
 
   ScriptExecInfo : instance serverside
 
+  In/Out:
+  -------
+
+  context will be modified by this method. elts in outargsname will be added and their corresponding value coming from evaluation.
   """
   import tempfile
   import pickle
@@ -847,18 +858,18 @@ def ExecCrashProofGeneric( code, context, outargsname, instanceOfLogOfCurrentSes
     return ret
   if returnCode != 0:
     if keepFilesToReplay:
-      evParams.destroyOnKO()
+      evParams.destroyOnKO( containerRef )
     else:
       evParams.destroyOnOK()
     raise RuntimeError(f"Subprocess launched {evParams.strDependingOnReturnCode(keepFilesToReplay,returnCode)}stdout :\n{stdout}\nstderr :\n{stderr}")
 
-def ExecCrashProofWithReplay( code, context, outargsname, instanceOfLogOfCurrentSession ):
-  return ExecCrashProofGeneric(code, context, outargsname, instanceOfLogOfCurrentSession, True)
+def ExecCrashProofWithReplay( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ):
+  return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, True)
 
-def ExecCrashProofWithoutReplay( code, context, outargsname, instanceOfLogOfCurrentSession ):
-  return ExecCrashProofGeneric(code, context, outargsname, instanceOfLogOfCurrentSession, False)
+def ExecCrashProofWithoutReplay( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ):
+  return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, False)
 
-def ExecLocal( code, context, outargsname, instanceOfLogOfCurrentSession ):
+def ExecLocal( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ):
   exec( code, context )
   return instanceOfLogOfCurrentSession._current_instance
 
@@ -1087,18 +1098,18 @@ class PyScriptNode_i(PyScriptNode_Abstract_i):
     super().__init__(nodeName, code, poa, my_container, logscript)
 
   def executeNow(self, outargsname):
-    return ExecLocal(self.ccode,self.context,outargsname,self._current_execution_session)
+    return ExecLocal(self.ccode,self.context,outargsname,self.my_container,self._current_execution_session)
     
 class PyScriptNode_OutOfProcess_i(PyScriptNode_Abstract_i):
   def __init__(self, nodeName, code, poa, my_container, logscript):
     super().__init__(nodeName, code, poa, my_container, logscript)
 
   def executeNow(self, outargsname):
-    return ExecCrashProofWithoutReplay(self.code,self.context,outargsname,self._current_execution_session)
+    return ExecCrashProofWithoutReplay(self.code,self.context,outargsname,self.my_container,self._current_execution_session)
 
 class PyScriptNode_OutOfProcess_Replay_i(PyScriptNode_Abstract_i):
   def __init__(self, nodeName, code, poa, my_container, logscript):
     super().__init__(nodeName, code, poa, my_container, logscript)
 
   def executeNow(self, outargsname):
-    return ExecCrashProofWithReplay(self.code,self.context,outargsname,self._current_execution_session)
+    return ExecCrashProofWithReplay(self.code,self.context,outargsname,self.my_container,self._current_execution_session)
index cb17204452fe58976c4bb4069bab05517f03003e..027cf790473a22590bd6eb9c96b52b53621a6274 100644 (file)
@@ -32,11 +32,13 @@ import pickle
 import tempfile
 import logging
 from datetime import datetime
-
+import subprocess as sp
 
 killMeCode = """
 import os
+import sys
 j = 7 * i
+sys.stdout.write(str(j)) ; sys.stdout.flush() # the aime of test in replay mode to be sure that case is runnable
 os.kill( os.getpid() , signal.SIGKILL)# the aim of test is here
 """
 
@@ -46,7 +48,7 @@ my_log_4_this_session.addFreestyleAndFlush( ("a",777) ) # to check that hidden v
 """
 
 class testPerfLogManager1(unittest.TestCase):
-    def tess0(self):
+    def test0(self):
         """
         EDF29852 : Kill container with OutOfProcessNoReplay mode and see if container still responds.
         """
@@ -98,6 +100,20 @@ class testPerfLogManager1(unittest.TestCase):
         self.assertEqual(ret,24) # container has received a SIGKILL but it kindly continue to respond :)
         a = salome.logm.NaiveFetch()
         self.assertEqual(a[0][2][0].get().freestyle,[('a',777)])
+        grpsOfLogToKill = cont.getAllLogFileNameGroups()
+        self.assertEqual(1,len(grpsOfLogToKill))
+        replayInput = grpsOfLogToKill[0]
+        # now try to replay the failing case
+        p = sp.Popen(["python3",os.path.basename(replayInput[0])],cwd = os.path.dirname(replayInput[0]),stdout=sp.PIPE,stderr=sp.PIPE)
+        out,err = p.communicate()
+        self.assertEqual(1,p.returncode) # very important ! The failing case must continue to fail :)
+        self.assertEqual("21".encode(),out) # very important to check that the reported case is standalone enough to be replayable poste mortem
+        # cleanup
+        dn = os.path.dirname(replayInput[0])
+        for elt in replayInput:
+            zeFile = os.path.join( dn, os.path.basename(elt) )
+            if os.path.exists( zeFile ):
+                os.unlink( zeFile )
         cont.Shutdown()
 
 if __name__ == '__main__':