From 5abef8637180921342be64ec654c662987f50c6d Mon Sep 17 00:00:00 2001 From: Anthony Geay Date: Fri, 31 May 2024 18:32:46 +0200 Subject: [PATCH] [EDF30062] [EDF29150]: Additional fault tolerant mecanism --- idl/SALOME_Component.idl | 2 + idl/SALOME_ContainerManager.idl | 4 + src/Basics/KernelBasis.cxx | 30 ++++++ src/Basics/KernelBasis.hxx | 4 +- src/Basics/KernelBasis.i | 12 +++ src/Container/CMakeLists.txt | 8 +- src/Container/Container_i.cxx | 5 + src/Container/SALOME_Container.py | 14 +++ src/Container/SALOME_ContainerManager.cxx | 15 +++ src/Container/SALOME_ContainerManager.hxx | 4 + src/Container/SALOME_Container_No_NS_Serv.cxx | 5 +- .../SALOME_Container_No_NS_Serv_Generic.hxx | 32 +++---- ...SALOME_Container_No_NS_Serv_OutProcess.cxx | 5 +- ...OME_Container_No_NS_Serv_OutProcess_FT.cxx | 25 +++++ ...Container_No_NS_Serv_OutProcess_Replay.cxx | 5 +- ...tainer_No_NS_Serv_OutProcess_Replay_FT.cxx | 25 +++++ src/Container/SALOME_Container_i.hxx | 29 +++++- src/Container/SALOME_PyNode.py | 91 ++++++++++++++++--- src/Launcher/Test/testCrashProofContainer.py | 41 ++++++++- 19 files changed, 320 insertions(+), 36 deletions(-) create mode 100644 src/Container/SALOME_Container_No_NS_Serv_OutProcess_FT.cxx create mode 100644 src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay_FT.cxx diff --git a/idl/SALOME_Component.idl b/idl/SALOME_Component.idl index 04ac5b340..2d7d3ea5b 100644 --- a/idl/SALOME_Component.idl +++ b/idl/SALOME_Component.idl @@ -89,6 +89,8 @@ module Engines void set_big_obj_on_disk_threshold(in long thresholdInByte); void set_big_obj_on_disk_directory(in string directory); + + void set_number_of_retry(in long nbRetry); void addLogFileNameGroup(in vectorOfString groupOfLogFileNames); diff --git a/idl/SALOME_ContainerManager.idl b/idl/SALOME_ContainerManager.idl index 32d9c6676..110c5b419 100644 --- a/idl/SALOME_ContainerManager.idl +++ b/idl/SALOME_ContainerManager.idl @@ -112,6 +112,10 @@ interface ContainerManager void SetBigObjOnDiskDirectory(in string directory); + void SetNumberOfRetry(in long nbRetry); + + long GetNumberOfRetry(); + void SetCodeOnContainerStartUp(in string code); string GetCodeOnContainerStartUp(); diff --git a/src/Basics/KernelBasis.cxx b/src/Basics/KernelBasis.cxx index 649f51556..c236f0020 100644 --- a/src/Basics/KernelBasis.cxx +++ b/src/Basics/KernelBasis.cxx @@ -81,6 +81,10 @@ namespace SALOME static constexpr char OUT_OF_PROCESS_NO_REPLAY_VALUE_STR[] = "OutOfProcessNoReplay"; static constexpr char OUT_OF_PROCESS_WITH_REPLAY_VALUE = 2; static constexpr char OUT_OF_PROCESS_WITH_REPLAY_VALUE_STR[] = "OutOfProcessWithReplay"; + static constexpr char OUT_OF_PROCESS_NO_REPLAY_FT_VALUE = 3; + static constexpr char OUT_OF_PROCESS_NO_REPLAY_FT_VALUE_STR[] = "OutOfProcessNoReplayFT"; + static constexpr char OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE = 4; + static constexpr char OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE_STR[] = "OutOfProcessWithReplayFT"; static PyExecutionMode FromIntToPyExecutionMode(char value) { @@ -92,6 +96,10 @@ namespace SALOME return PyExecutionMode::OutOfProcessNoReplay; case OUT_OF_PROCESS_WITH_REPLAY_VALUE: return PyExecutionMode::OutOfProcessWithReplay; + case OUT_OF_PROCESS_NO_REPLAY_FT_VALUE: + return PyExecutionMode::OutOfProcessNoReplayFT; + case OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE: + return PyExecutionMode::OutOfProcessWithReplayFT; } throw std::range_error("FromIntToPyExecutionMode : Invalid value for Py Execution Mode ! Must be in 0 (InProcess), 1 (OutOfProcessNoReplay) or 2 (OutOfProcessWithReplay) !"); } @@ -104,6 +112,10 @@ namespace SALOME return PyExecutionMode::OutOfProcessNoReplay; if(value == OUT_OF_PROCESS_WITH_REPLAY_VALUE_STR) return PyExecutionMode::OutOfProcessWithReplay; + if(value == OUT_OF_PROCESS_NO_REPLAY_FT_VALUE_STR) + return PyExecutionMode::OutOfProcessNoReplayFT; + if(value == OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE_STR) + return PyExecutionMode::OutOfProcessWithReplayFT; throw std::range_error("FromStrToPyExecutionMode : Invalid str value for py execution mode !"); } @@ -117,6 +129,10 @@ namespace SALOME return OUT_OF_PROCESS_NO_REPLAY_VALUE_STR; case PyExecutionMode::OutOfProcessWithReplay: return OUT_OF_PROCESS_WITH_REPLAY_VALUE_STR; + case PyExecutionMode::OutOfProcessNoReplayFT: + return OUT_OF_PROCESS_NO_REPLAY_FT_VALUE_STR; + case PyExecutionMode::OutOfProcessWithReplayFT: + return OUT_OF_PROCESS_WITH_REPLAY_FT_VALUE_STR; default: throw std::range_error("FromExecutionModeToStr : Invalid str value for py execution mode !"); } @@ -139,6 +155,10 @@ void SALOME::SetBigObjOnDiskThreshold(int newThresholdInByte) static std::string SALOME_FILE_BIG_OBJ_DIR; +constexpr int DFT_SALOME_NB_RETRY = 1; + +static int SALOME_NB_RETRY = DFT_SALOME_NB_RETRY; + std::string SALOME::GetBigObjOnDiskDirectory() { return SALOME_FILE_BIG_OBJ_DIR; @@ -154,6 +174,16 @@ bool SALOME::BigObjOnDiskDirectoryDefined() return ! SALOME_FILE_BIG_OBJ_DIR.empty(); } +void SALOME::SetNumberOfRetry(int nbRetry) +{ + SALOME_NB_RETRY = nbRetry; +} + +int SALOME::GetNumberOfRetry() +{ + return SALOME_NB_RETRY; +} + static SALOME::PyExecutionMode DefaultPyExecMode = SALOME::PyExecutionMode::NotSet; void SALOME::SetPyExecutionMode(PyExecutionMode mode) diff --git a/src/Basics/KernelBasis.hxx b/src/Basics/KernelBasis.hxx index cbecfe72e..d2a7bb222 100644 --- a/src/Basics/KernelBasis.hxx +++ b/src/Basics/KernelBasis.hxx @@ -37,7 +37,7 @@ void BASICS_EXPORT WriteInStderr(const std::string& msg); namespace SALOME { - enum class PyExecutionMode { NotSet, InProcess, OutOfProcessNoReplay, OutOfProcessWithReplay }; + enum class PyExecutionMode { NotSet, InProcess, OutOfProcessNoReplay, OutOfProcessWithReplay, OutOfProcessNoReplayFT, OutOfProcessWithReplayFT }; void BASICS_EXPORT SetPyExecutionMode(PyExecutionMode mode); void BASICS_EXPORT SetPyExecutionModeStr(const std::string& mode); std::vector BASICS_EXPORT GetAllPyExecutionModes(); @@ -48,4 +48,6 @@ namespace SALOME std::string BASICS_EXPORT GetBigObjOnDiskDirectory(); void BASICS_EXPORT SetBigObjOnDiskDirectory(const std::string& directory); bool BASICS_EXPORT BigObjOnDiskDirectoryDefined(); + void BASICS_EXPORT SetNumberOfRetry(int nbRetry); + int BASICS_EXPORT GetNumberOfRetry(); } diff --git a/src/Basics/KernelBasis.i b/src/Basics/KernelBasis.i index 092c7537f..6c5e8536f 100644 --- a/src/Basics/KernelBasis.i +++ b/src/Basics/KernelBasis.i @@ -56,6 +56,8 @@ using namespace SALOME; %rename (GetBigObjOnDiskDirectory) GetBigObjOnDiskDirectorySwig; %rename (SetBigObjOnDiskDirectory) SetBigObjOnDiskDirectorySwig; %rename (BigObjOnDiskDirectoryDefined) BigObjOnDiskDirectoryDefinedSwig; +%rename (SetNumberOfRetry) SetNumberOfRetrySwig; +%rename (GetNumberOfRetry) GetNumberOfRetrySwig; bool getSSLMode(); void setSSLMode(bool sslMode); @@ -142,6 +144,16 @@ bool BigObjOnDiskDirectoryDefinedSwig() return SALOME::BigObjOnDiskDirectoryDefined(); } +void SetNumberOfRetrySwig(int nbRetry) +{ + SALOME::SetNumberOfRetry( nbRetry ); +} + +int GetNumberOfRetrySwig() +{ + return SALOME::GetNumberOfRetry( ); +} + void SetVerbosityLevelSwig(const std::string& level) { SetVerbosityLevelStr(level); diff --git a/src/Container/CMakeLists.txt b/src/Container/CMakeLists.txt index 4eb507ecc..e61ebbb21 100644 --- a/src/Container/CMakeLists.txt +++ b/src/Container/CMakeLists.txt @@ -119,11 +119,17 @@ TARGET_LINK_LIBRARIES(SALOME_Container_No_NS_Serv_OutProcess SalomeContainerServ ADD_EXECUTABLE(SALOME_Container_No_NS_Serv_OutProcess_Replay SALOME_Container_No_NS_Serv_OutProcess_Replay.cxx) TARGET_LINK_LIBRARIES(SALOME_Container_No_NS_Serv_OutProcess_Replay SalomeContainerServer) +ADD_EXECUTABLE(SALOME_Container_No_NS_Serv_OutProcess_FT SALOME_Container_No_NS_Serv_OutProcess_FT.cxx) +TARGET_LINK_LIBRARIES(SALOME_Container_No_NS_Serv_OutProcess_FT SalomeContainerServer) + +ADD_EXECUTABLE(SALOME_Container_No_NS_Serv_OutProcess_Replay_FT SALOME_Container_No_NS_Serv_OutProcess_Replay_FT.cxx) +TARGET_LINK_LIBRARIES(SALOME_Container_No_NS_Serv_OutProcess_Replay_FT SalomeContainerServer) + IF(SALOME_BUILD_TESTS) ADD_EXECUTABLE(TestSalome_file TestSalome_file.cxx) TARGET_LINK_LIBRARIES(TestSalome_file SALOMETraceCollectorTest ${SALOME_Container_LIBS}) ENDIF() -INSTALL(TARGETS SALOME_Container SALOME_Container_No_NS_Serv SALOME_Container_No_NS_Serv_OutProcess SALOME_Container_No_NS_Serv_OutProcess_Replay DESTINATION ${SALOME_INSTALL_BINS}) +INSTALL(TARGETS SALOME_Container SALOME_Container_No_NS_Serv SALOME_Container_No_NS_Serv_OutProcess SALOME_Container_No_NS_Serv_OutProcess_Replay SALOME_Container_No_NS_Serv_OutProcess_FT SALOME_Container_No_NS_Serv_OutProcess_Replay_FT DESTINATION ${SALOME_INSTALL_BINS}) # Executable scripts to be installed SALOME_INSTALL_SCRIPTS("${SCRIPTS}" ${SALOME_INSTALL_SCRIPT_PYTHON}) diff --git a/src/Container/Container_i.cxx b/src/Container/Container_i.cxx index a12edd4ff..4e0d5fe93 100644 --- a/src/Container/Container_i.cxx +++ b/src/Container/Container_i.cxx @@ -1180,6 +1180,11 @@ void Abstract_Engines_Container_i::set_big_obj_on_disk_directory(const char *dir SALOME::SetBigObjOnDiskDirectory(directory); } +void Abstract_Engines_Container_i::set_number_of_retry(CORBA::Long nbRetry) +{ + SALOME::SetNumberOfRetry( nbRetry ); +} + Engines::vectorOfString_var FromVecStringCppToCORBA( const std::vector& group) { Engines::vectorOfString_var ret( new Engines::vectorOfString ); diff --git a/src/Container/SALOME_Container.py b/src/Container/SALOME_Container.py index bd267d4dc..a7d2f83c5 100644 --- a/src/Container/SALOME_Container.py +++ b/src/Container/SALOME_Container.py @@ -223,3 +223,17 @@ class SALOME_Container_OutOfProcess_Replay_i(SALOME_Container_i): def getPyScriptCls(self): return SALOME_PyNode.PyScriptNode_OutOfProcess_Replay_i + +class SALOME_Container_OutOfProcess_FT_i(SALOME_Container_i): + def __init__(self, containerName, containerIORStr, dftTimeIntervalInMs): + super().__init__(containerName, containerIORStr, dftTimeIntervalInMs) + + def getPyScriptCls(self): + return SALOME_PyNode.PyScriptNode_OutOfProcess_FT_i + +class SALOME_Container_OutOfProcess_Replay_FT_i(SALOME_Container_i): + def __init__(self, containerName, containerIORStr, dftTimeIntervalInMs): + super().__init__(containerName, containerIORStr, dftTimeIntervalInMs) + + def getPyScriptCls(self): + return SALOME_PyNode.PyScriptNode_OutOfProcess_Replay_FT_i diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index be55b1bf3..2ee5639bc 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -246,6 +246,16 @@ void SALOME_ContainerManager::SetBigObjOnDiskDirectory(const char *directory) SALOME::SetBigObjOnDiskDirectory(directory); } + void SALOME_ContainerManager::SetNumberOfRetry(CORBA::Long nbRetry) + { + SALOME::SetNumberOfRetry( nbRetry ); + } + +CORBA::Long SALOME_ContainerManager::GetNumberOfRetry() +{ + return SALOME::GetNumberOfRetry(); +} + //============================================================================= //! Loop on all the containers listed in naming service, ask shutdown on each /*! CORBA Method: @@ -541,6 +551,7 @@ Engines::Container_ptr SALOME_ContainerManager::GiveContainer(const Engines::Con INFOS("[GiveContainer] container " << containerNameInNS << " override " << envInfo.str()); cont->set_big_obj_on_disk_directory( SALOME::GetBigObjOnDiskDirectory().c_str() ); cont->set_big_obj_on_disk_threshold( SALOME::GetBigObjOnDiskThreshold() ); + cont->set_number_of_retry( SALOME::GetNumberOfRetry() ); Engines::FieldsDict envCorba; { auto sz = _override_env.size(); @@ -586,6 +597,10 @@ std::string SALOME_ContainerManager::GetCppBinaryOfKernelSSLContainer() const return "SALOME_Container_No_NS_Serv_OutProcess"; case SALOME::PyExecutionMode::OutOfProcessWithReplay: return "SALOME_Container_No_NS_Serv_OutProcess_Replay"; + case SALOME::PyExecutionMode::OutOfProcessNoReplayFT: + return "SALOME_Container_No_NS_Serv_OutProcess_FT"; + case SALOME::PyExecutionMode::OutOfProcessWithReplayFT: + return "SALOME_Container_No_NS_Serv_OutProcess_Replay_FT"; default: { ERROR_MESSAGE("Not manager py execution mode"); diff --git a/src/Container/SALOME_ContainerManager.hxx b/src/Container/SALOME_ContainerManager.hxx index 6119ec393..a02bd9258 100644 --- a/src/Container/SALOME_ContainerManager.hxx +++ b/src/Container/SALOME_ContainerManager.hxx @@ -83,6 +83,10 @@ public: void SetBigObjOnDiskDirectory(const char *directory) override; + void SetNumberOfRetry(CORBA::Long nbRetry) override; + + CORBA::Long GetNumberOfRetry() override; + static const char *_ContainerManagerNameInNS; private: diff --git a/src/Container/SALOME_Container_No_NS_Serv.cxx b/src/Container/SALOME_Container_No_NS_Serv.cxx index f7df00250..097663605 100644 --- a/src/Container/SALOME_Container_No_NS_Serv.cxx +++ b/src/Container/SALOME_Container_No_NS_Serv.cxx @@ -19,4 +19,7 @@ #include "SALOME_Container_No_NS_Serv_Generic.hxx" -GENERIC_CONTAINER_EXECUTABLE( Engines_Container_SSL_i ) +int main(int argc, char* argv[]) +{ + return GenericContainerExecutable(argc,argv); +} diff --git a/src/Container/SALOME_Container_No_NS_Serv_Generic.hxx b/src/Container/SALOME_Container_No_NS_Serv_Generic.hxx index d5bc1e7f6..bd4932a2f 100644 --- a/src/Container/SALOME_Container_No_NS_Serv_Generic.hxx +++ b/src/Container/SALOME_Container_No_NS_Serv_Generic.hxx @@ -24,20 +24,20 @@ #include "SALOME_KernelORB.hxx" #include "KernelBasis.hxx" -#define GENERIC_CONTAINER_EXECUTABLE( cls ) \ -int main(int argc, char* argv[]) \ -{ \ - if(argc<3) \ - THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : requires 2 input arguments " ); \ - CORBA::ORB_ptr orb(KERNEL::getORB()); \ - std::string IOROfEmbeddedNamingService(argv[2]); \ - setIOROfEmbeddedNS(IOROfEmbeddedNamingService); \ - CORBA::Object_var ns_serv_obj_base = orb->string_to_object(IOROfEmbeddedNamingService.c_str()); \ - if( CORBA::is_nil(ns_serv_obj_base) ) \ - THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : argument 2 is NOT a valid IOR" ); \ - Engines::EmbeddedNamingService_var ns_serv_obj = Engines::EmbeddedNamingService::_narrow(ns_serv_obj_base); \ - if( CORBA::is_nil(ns_serv_obj) ) \ - THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : argument 2 is NOT a valid IOR of Engines::EmbeddedNamingService" ); \ - std::unique_ptr ns( new SALOME_Embedded_NamingService_Client(ns_serv_obj) ); \ - return container_common_main(argc,argv,std::move(ns)); \ +template +int GenericContainerExecutable(int argc, char* argv[]) +{ + if(argc<3) + THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : requires 2 input arguments " ); + CORBA::ORB_ptr orb(KERNEL::getORB()); + std::string IOROfEmbeddedNamingService(argv[2]); + setIOROfEmbeddedNS(IOROfEmbeddedNamingService); + CORBA::Object_var ns_serv_obj_base = orb->string_to_object(IOROfEmbeddedNamingService.c_str()); + if( CORBA::is_nil(ns_serv_obj_base) ) + THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : argument 2 is NOT a valid IOR" ); + Engines::EmbeddedNamingService_var ns_serv_obj = Engines::EmbeddedNamingService::_narrow(ns_serv_obj_base); + if( CORBA::is_nil(ns_serv_obj) ) + THROW_SALOME_EXCEPTION( "SALOME_Container_No_NS_Serv : argument 2 is NOT a valid IOR of Engines::EmbeddedNamingService" ); + std::unique_ptr ns( new SALOME_Embedded_NamingService_Client(ns_serv_obj) ); + return container_common_main(argc,argv,std::move(ns)); } diff --git a/src/Container/SALOME_Container_No_NS_Serv_OutProcess.cxx b/src/Container/SALOME_Container_No_NS_Serv_OutProcess.cxx index f535d073a..c1c02c784 100644 --- a/src/Container/SALOME_Container_No_NS_Serv_OutProcess.cxx +++ b/src/Container/SALOME_Container_No_NS_Serv_OutProcess.cxx @@ -19,4 +19,7 @@ #include "SALOME_Container_No_NS_Serv_Generic.hxx" -GENERIC_CONTAINER_EXECUTABLE( Engines_Container_SSL_OutOfProcess_i ) +int main(int argc, char* argv[]) +{ + return GenericContainerExecutable(argc,argv); +} diff --git a/src/Container/SALOME_Container_No_NS_Serv_OutProcess_FT.cxx b/src/Container/SALOME_Container_No_NS_Serv_OutProcess_FT.cxx new file mode 100644 index 000000000..fdc3d9669 --- /dev/null +++ b/src/Container/SALOME_Container_No_NS_Serv_OutProcess_FT.cxx @@ -0,0 +1,25 @@ +// Copyright (C) 2021-2024 CEA, EDF +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// + +#include "SALOME_Container_No_NS_Serv_Generic.hxx" + +int main(int argc, char* argv[]) +{ + return GenericContainerExecutable(argc,argv); +} diff --git a/src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay.cxx b/src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay.cxx index 27a90f58e..be198b252 100644 --- a/src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay.cxx +++ b/src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay.cxx @@ -19,4 +19,7 @@ #include "SALOME_Container_No_NS_Serv_Generic.hxx" -GENERIC_CONTAINER_EXECUTABLE( Engines_Container_SSL_OutOfProcess_Replay_i ) +int main(int argc, char* argv[]) +{ + return GenericContainerExecutable(argc,argv); +} diff --git a/src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay_FT.cxx b/src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay_FT.cxx new file mode 100644 index 000000000..7568dea30 --- /dev/null +++ b/src/Container/SALOME_Container_No_NS_Serv_OutProcess_Replay_FT.cxx @@ -0,0 +1,25 @@ +// Copyright (C) 2021-2024 CEA, EDF +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// + +#include "SALOME_Container_No_NS_Serv_Generic.hxx" + +int main(int argc, char* argv[]) +{ + return GenericContainerExecutable(argc,argv); +} diff --git a/src/Container/SALOME_Container_i.hxx b/src/Container/SALOME_Container_i.hxx index d929939f0..08c174254 100644 --- a/src/Container/SALOME_Container_i.hxx +++ b/src/Container/SALOME_Container_i.hxx @@ -86,6 +86,8 @@ public: void set_big_obj_on_disk_directory(const char *directory) override; + void set_number_of_retry(CORBA::Long nbRetry) override; + void addLogFileNameGroup(const Engines::vectorOfString& groupOfLogFileNames) override; Engines::vectorOfVectorOfString *getAllLogFileNameGroups() override; @@ -223,7 +225,8 @@ protected: constexpr char PY_CONTAINER_CLS_NAME_IN_PROCESS[] = "SALOME_Container_i"; constexpr char PY_CONTAINER_CLS_NAME_OUT_PROCESS_NO_REPLAY[] = "SALOME_Container_OutOfProcess_i"; constexpr char PY_CONTAINER_CLS_NAME_OUT_PROCESS_WITH_REPLAY[] = "SALOME_Container_OutOfProcess_Replay_i"; - +constexpr char PY_CONTAINER_CLS_NAME_OUT_PROCESS_NO_REPLAY_FT[] = "SALOME_Container_OutOfProcess_FT_i"; +constexpr char PY_CONTAINER_CLS_NAME_OUT_PROCESS_WITH_REPLAY_FT[] = "SALOME_Container_OutOfProcess_Replay_FT_i"; class CONTAINER_EXPORT Engines_Container_i : public Abstract_Engines_Container_i { @@ -289,6 +292,30 @@ public: Abstract_Engines_Container_SSL_i(PY_CONTAINER_CLS_NAME_OUT_PROCESS_WITH_REPLAY, orb, poa, containerName, argc, argv, ns, isServantAloneInProcess) {} }; +class CONTAINER_EXPORT Engines_Container_SSL_OutOfProcess_FT_i : public Abstract_Engines_Container_SSL_i +{ +public: + Engines_Container_SSL_OutOfProcess_FT_i(CORBA::ORB_ptr orb, + PortableServer::POA_ptr poa, + char *containerName, + int argc, char *argv[], + SALOME_NamingService_Container_Abstract *ns = nullptr, + bool isServantAloneInProcess = true) : + Abstract_Engines_Container_SSL_i(PY_CONTAINER_CLS_NAME_OUT_PROCESS_NO_REPLAY_FT, orb, poa, containerName, argc, argv, ns, isServantAloneInProcess) {} +}; + +class CONTAINER_EXPORT Engines_Container_SSL_OutOfProcess_Replay_FT_i : public Abstract_Engines_Container_SSL_i +{ +public: + Engines_Container_SSL_OutOfProcess_Replay_FT_i(CORBA::ORB_ptr orb, + PortableServer::POA_ptr poa, + char *containerName, + int argc, char *argv[], + SALOME_NamingService_Container_Abstract *ns = nullptr, + bool isServantAloneInProcess = true) : + Abstract_Engines_Container_SSL_i(PY_CONTAINER_CLS_NAME_OUT_PROCESS_WITH_REPLAY_FT, orb, poa, containerName, argc, argv, ns, isServantAloneInProcess) {} +}; + /*! * Methods to be used in SSL mode to skip NS. */ diff --git a/src/Container/SALOME_PyNode.py b/src/Container/SALOME_PyNode.py index 312c7bf5d..0b0c9b3cd 100644 --- a/src/Container/SALOME_PyNode.py +++ b/src/Container/SALOME_PyNode.py @@ -40,6 +40,8 @@ MY_CONTAINER_ENTRY_IN_GLBS = "my_container" MY_PERFORMANCE_LOG_ENTRY_IN_GLBS = "my_log_4_this_session" +MY_KEY_TO_DETECT_FINISH = "neib av tuot" + class Generic(SALOME__POA.GenericObj): """A Python implementation of the GenericObj CORBA IDL""" def __init__(self,poa): @@ -560,6 +562,9 @@ class GenericPythonMonitoringLauncherCtxMgr: def __exit__(self,exctype, exc, tb): StopMonitoring( self._monitoring_params ) + del self._monitoring_params + import gc + gc.collect() # force destruction of objects even in raise context def StopMonitoring( monitoringInfo ): """ @@ -730,6 +735,9 @@ with open(inputFileName,"rb") as f: context[MY_PERFORMANCE_LOG_ENTRY_IN_GLBS] = eval( MY_PERFORMANCE_LOG_ENTRY_IN_GLBS ) with open(codeFileName,"r") as f: code = f.read() +# +import gc +gc.disable() # go for execution exec( code , context ) # filter part of context to be exported to father process @@ -792,7 +800,7 @@ Looks like a hard crash as returnCode {returnCode} != 0 {banner} """ -def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, keepFilesToReplay ): +def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, keepFilesToReplay, closeEyesOnErrorAtExit): """ Equivalent of exec(code,context) but executed in a separate subprocess to avoid to make the current process crash. @@ -805,6 +813,7 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL containerRef (Engines.Container) : Container ref (retrieving the Files to created when keepFilesToReplay is set to False) instanceOfLogOfCurrentSession (LogOfCurrentExecutionSession) : instance of LogOfCurrentExecutionSession to build remotely the reference in order to log information keepFilesToReplay (bool) : if True when something goes wrong during execution all the files to replay post mortem case are kept. If False only error is reported but files to replay are destoyed. + closeEyesOnErrorAtExit (bool) : if True in case of crash of subprocess, if MY_KEY_TO_DETECT_FINISH is displayed at the end of stdout Return: ------- @@ -820,8 +829,22 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL import pickle import subprocess as sp import CORBA + # + def IsConsideredAsOKRun( returnCode, closeEyesOnErrorAtExit , stderr ): + def StdErrTreatment(closeEyesOnErrorAtExit , stderr): + if not closeEyesOnErrorAtExit: + return stderr + else: + return stderr[:-len(MY_KEY_TO_DETECT_FINISH)] + if returnCode == 0: + return True,StdErrTreatment(closeEyesOnErrorAtExit , stderr) + if not closeEyesOnErrorAtExit: + return False, stderr + return stderr[-len(MY_KEY_TO_DETECT_FINISH):] == MY_KEY_TO_DETECT_FINISH,stderr[:-len(MY_KEY_TO_DETECT_FINISH)] + # def InternalExecResistant( code, context, outargsname): + import KernelBasis orb = CORBA.ORB_init(['']) iorScriptLog = orb.object_to_string( instanceOfLogOfCurrentSession._remote_handle )#ref ContainerScriptPerfLog_ptr #### @@ -830,6 +853,11 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL return os.path.splitext( os.path.basename(fname)[len(EXEC_CODE_FNAME_PXF):] )[0] with tempfile.NamedTemporaryFile(dir=os.getcwd(),prefix=EXEC_CODE_FNAME_PXF,suffix=".py", mode="w", delete = False) as codeFd: codeFd.write( code ) + if closeEyesOnErrorAtExit: + codeFd.write( """ +import sys +sys.stderr.write({!r}) +sys.stderr.flush()""".format( MY_KEY_TO_DETECT_FINISH ) ) codeFd.flush() codeFileName = os.path.basename( codeFd.name ) contextFileName = "contextsafe_{}.pckl".format( RetrieveUniquePartFromPfx( codeFileName ) ) @@ -839,24 +867,32 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL mainExecFileName = os.path.abspath( "mainexecsafe_{}.py".format( RetrieveUniquePartFromPfx( codeFileName ) ) ) with open(mainExecFileName,"w") as f: f.write( FinalCode.format( codeFileName, contextFileName, resFileName, outargsname, iorScriptLog ) ) - p = sp.Popen(["python3", mainExecFileName],stdout = sp.PIPE, stderr = sp.PIPE) - stdout, stderr = p.communicate() - returnCode = p.returncode + for iTry in range( KernelBasis.GetNumberOfRetry() ): + if iTry > 0: + print( "WARNING : Retry # {}. Following code has generated non zero return code ( {} ). Trying again ... \n{}".format( iTry, returnCode, code ) ) + p = sp.Popen(["python3", mainExecFileName],stdout = sp.PIPE, stderr = sp.PIPE) + stdout, stderr = p.communicate() + returnCode = p.returncode + if returnCode == 0: + break return returnCode, stdout, stderr, PythonFunctionEvaluatorParams(mainExecFileName,codeFileName,contextFileName,resFileName) ret = instanceOfLogOfCurrentSession._current_instance returnCode, stdout, stderr, evParams = InternalExecResistant( code, context, outargsname ) stdout = stdout.decode() stderr = stderr.decode() sys.stdout.write( stdout ) ; sys.stdout.flush() + isOK, stderr = IsConsideredAsOKRun( returnCode, closeEyesOnErrorAtExit , stderr ) sys.stderr.write( stderr ) ; sys.stderr.flush() - if returnCode == 0: + if isOK: pcklData = instanceOfLogOfCurrentSession._remote_handle.getObj() if len(pcklData) > 0: ret = pickle.loads( pcklData ) context.update( evParams.result ) evParams.destroyOnOK() + if returnCode != 0: + print( "WARNING : Following code has generated non zero return code ( {} ) but considered as OK\n{}".format( returnCode, code ) ) return ret - if returnCode != 0: + else: if keepFilesToReplay: evParams.destroyOnKO( containerRef ) else: @@ -864,10 +900,16 @@ def ExecCrashProofGeneric( code, context, outargsname, containerRef, instanceOfL raise RuntimeError(f"Subprocess launched {evParams.strDependingOnReturnCode(keepFilesToReplay,returnCode)}stdout :\n{stdout}\nstderr :\n{stderr}") def ExecCrashProofWithReplay( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ): - return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, True) + return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, True, False) def ExecCrashProofWithoutReplay( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ): - return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, False) + return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, False, False) + +def ExecCrashProofWithReplayFT( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ): + return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, True, True) + +def ExecCrashProofWithoutReplayFT( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ): + return ExecCrashProofGeneric(code, context, outargsname, containerRef, instanceOfLogOfCurrentSession, False, True) def ExecLocal( code, context, outargsname, containerRef, instanceOfLogOfCurrentSession ): exec( code, context ) @@ -894,7 +936,13 @@ class LogOfCurrentExecutionSession(LogOfCurrentExecutionSessionAbs): self.finalizeAndPushToMaster() def finalizeAndPushToMaster(self): - self._remote_handle.assign( pickle.dumps( self._current_instance ) ) + """ + Voluntary do nothing in case of problem to avoid to trouble execution + """ + try: + self._remote_handle.assign( pickle.dumps( self._current_instance ) ) + except: + pass class LogOfCurrentExecutionSessionStub(LogOfCurrentExecutionSessionAbs): """ @@ -1019,17 +1067,20 @@ class PyScriptNode_Abstract_i(Engines__POA.PyScriptNode,Generic,abc.ABC): def executeSecond(self,outargsname): """ Same than second part of self.execute to reduce memory peak.""" + def executeSecondInternal(monitoringtimeresms): + with GenericPythonMonitoringLauncherCtxMgr( CPUMemoryMonitoring( monitoringtimeresms ) ) as monitoringParams: + currentInstance = self.executeNow( outargsname ) + cpumeminfo = ReadCPUMemInfo( monitoringParams ) + return cpumeminfo, currentInstance + import sys try: self.addTimeInfoOnLevel2("startExecTime") ## self.addInfoOnLevel2("measureTimeResolution",self.my_container_py.monitoringtimeresms()) - with GenericPythonMonitoringLauncherCtxMgr( CPUMemoryMonitoring( self.my_container_py.monitoringtimeresms() ) ) as monitoringParams: - self._current_execution_session._current_instance = self.executeNow( outargsname ) - cpumeminfo = ReadCPUMemInfo( monitoringParams ) + cpumeminfo, self._current_execution_session._current_instance = executeSecondInternal( self.my_container_py.monitoringtimeresms() ) ## self.addInfoOnLevel2("CPUMemDuringExec",cpumeminfo) - del monitoringParams self.addTimeInfoOnLevel2("endExecTime") self.addTimeInfoOnLevel2("startOutputTime") argsout=[] @@ -1130,3 +1181,17 @@ class PyScriptNode_OutOfProcess_Replay_i(PyScriptNode_Abstract_i): def executeNow(self, outargsname): return ExecCrashProofWithReplay(self.code,self.context,outargsname,self.my_container,self._current_execution_session) + +class PyScriptNode_OutOfProcess_FT_i(PyScriptNode_Abstract_i): + def __init__(self, nodeName, code, poa, my_container, logscript): + super().__init__(nodeName, code, poa, my_container, logscript) + + def executeNow(self, outargsname): + return ExecCrashProofWithoutReplayFT(self.code,self.context,outargsname,self.my_container,self._current_execution_session) + +class PyScriptNode_OutOfProcess_Replay_FT_i(PyScriptNode_Abstract_i): + def __init__(self, nodeName, code, poa, my_container, logscript): + super().__init__(nodeName, code, poa, my_container, logscript) + + def executeNow(self, outargsname): + return ExecCrashProofWithReplayFT(self.code,self.context,outargsname,self.my_container,self._current_execution_session) diff --git a/src/Launcher/Test/testCrashProofContainer.py b/src/Launcher/Test/testCrashProofContainer.py index e8b7a9730..0d2a168fe 100644 --- a/src/Launcher/Test/testCrashProofContainer.py +++ b/src/Launcher/Test/testCrashProofContainer.py @@ -37,6 +37,7 @@ import subprocess as sp killMeCode = """ import os import sys +import signal j = 7 * i sys.stdout.write(str(j)) ; sys.stdout.flush() # the aime of test in replay mode to be sure that case is runnable os.kill( os.getpid() , signal.SIGKILL)# the aim of test is here @@ -53,6 +54,19 @@ cst = KernelBasis.GetTimeAdjustmentCst() KernelBasis.HeatMarcel(5 * nbcore * cst,nbcore) j = 8*i""" +killMeAtTheEnd = """import atexit +import KernelServices + +def ErrorAtexit(): + KernelServices.GenerateViolentMemoryFaultForTestPurpose() + +atexit.register(ErrorAtexit) + +print("OKKKKKK") +j = 9 * i +print("OKKKKKK3333") +""" + class testPerfLogManager1(unittest.TestCase): def test0(self): """ @@ -114,7 +128,7 @@ class testPerfLogManager1(unittest.TestCase): # now try to replay the failing case p = sp.Popen(["python3",os.path.basename(replayInput[0])],cwd = os.path.dirname(replayInput[0]),stdout=sp.PIPE,stderr=sp.PIPE) out,err = p.communicate() - self.assertEqual(1,p.returncode) # very important ! The failing case must continue to fail :) + self.assertNotEqual(p.returncode,0) # very important ! The failing case must continue to fail :) self.assertEqual("21".encode(),out) # very important to check that the reported case is standalone enough to be replayable poste mortem # cleanup dn = os.path.dirname(replayInput[0]) @@ -156,6 +170,31 @@ class testPerfLogManager1(unittest.TestCase): self.assertGreater(len(greater_than_100),1) # At minimum one measure must report CPU load > 100% cont.Shutdown() + def test3(self): + """ + [EDF29150] : test that we can resist to a crash at exit + """ + salome.salome_init() + KernelBasis.SetPyExecutionMode("OutOfProcessWithReplayFT") + hostname = "localhost" + cp = pylauncher.GetRequestForGiveContainer(hostname,"container_crash_test") + salome.cm.SetNumberOfRetry( 3 ) + salome.cm.SetBigObjOnDiskThreshold(1000) + salome.cm.SetOverrideEnvForContainersSimple(env = []) + cont = salome.cm.GiveContainer(cp) + poa = salome.orb.resolve_initial_references("RootPOA") + obj = SALOME_PyNode.SenderByte_i(poa,pickle.dumps( (["i"],{"i": 3} ) )) ; id_o = poa.activate_object(obj) ; refPtr = poa.id_to_reference(id_o) + pyscript = cont.createPyScriptNode("testScript4",killMeAtTheEnd) + pyscript.executeFirst(refPtr) + ret = pyscript.executeSecond(["j"]) + ret = pickle.loads( SALOME_PyNode.SeqByteReceiver(ret[0]).data() ) + self.assertEqual(ret,27) + with open(cont.locallogfilename) as f: + logCont = f.read( ) + self.assertTrue( "WARNING : Retry #" in logCont) + self.assertTrue( "WARNING : Following code has generated non zero return code" in logCont )# should report something into the container + cont.Shutdown() + if __name__ == '__main__': from salome_utils import positionVerbosityOfLoggerRegardingState,setVerboseLevel,setVerbose salome.standalone() -- 2.39.2