From 35152f13894fd72c54b00360adbcd1e0e380f06b Mon Sep 17 00:00:00 2001 From: prascle Date: Thu, 19 Feb 2009 13:20:45 +0000 Subject: [PATCH] merge from BR_V51_BS 19/02/09 --- salome_adm/unix/config_files/check_openmpi.m4 | 6 + src/Container/Component_i.cxx | 58 +-- src/Container/Container_i.cxx | 7 +- src/Container/SALOME_Component_i.hxx | 45 +-- src/Container/SALOME_ContainerManager.cxx | 352 +++++++++--------- src/Container/SALOME_ContainerManager.hxx | 19 +- src/MPIContainer/MPIContainer_i.cxx | 118 +++--- src/MPIContainer/MPIObject_i.cxx | 82 +++- src/MPIContainer/MPIObject_i.hxx | 25 +- src/MPIContainer/SALOME_MPIContainer.cxx | 8 +- 10 files changed, 382 insertions(+), 338 deletions(-) diff --git a/salome_adm/unix/config_files/check_openmpi.m4 b/salome_adm/unix/config_files/check_openmpi.m4 index cbbe91f4d..fe5957a2c 100644 --- a/salome_adm/unix/config_files/check_openmpi.m4 +++ b/salome_adm/unix/config_files/check_openmpi.m4 @@ -42,9 +42,15 @@ if test "$WITHOPENMPI" = yes; then AC_CHECK_HEADER(mpi.h,WITHOPENMPI="yes",WITHOPENMPI="no") CPPFLAGS="$CPPFLAGS_old" + LIBS_old="$LIBS" + LIBS="-L${OPENMPI_HOME}/lib $LIBS" + AC_CHECK_LIB(mpi,MPI_Publish_name,WITHMPI2="yes",WITHMPI2="no") + LIBS="$LIBS_old" + AC_MSG_CHECKING(for openmpi) if test "$WITHOPENMPI" = "yes";then mpi_ok=yes + CPPFLAGS="-DWITHOPENMPI $CPPFLAGS" AC_MSG_RESULT(yes) else mpi_ok=no diff --git a/src/Container/Component_i.cxx b/src/Container/Component_i.cxx index aa0922248..eca25186b 100644 --- a/src/Container/Component_i.cxx +++ b/src/Container/Component_i.cxx @@ -83,6 +83,7 @@ Engines_Component_i::Engines_Component_i():_myConnexionToRegistry(0), _notifSupp * \param instanceName unique instance name for this object (see Container_i) * \param interfaceName component class name * \param notif use of notification + * \param regist (true or false) use of registry (default true) */ //============================================================================= @@ -91,7 +92,8 @@ Engines_Component_i::Engines_Component_i(CORBA::ORB_ptr orb, PortableServer::ObjectId * contId, const char *instanceName, const char *interfaceName, - bool notif) : + bool notif, + bool regist ) : _instanceName(instanceName), _interfaceName(interfaceName), _myConnexionToRegistry(0), @@ -105,18 +107,20 @@ Engines_Component_i::Engines_Component_i(CORBA::ORB_ptr orb, _CanceledThread(false) { MESSAGE("Component constructor with instanceName "<< _instanceName); - //SCRUTE(pd_refCount); _orb = CORBA::ORB::_duplicate(orb); _poa = PortableServer::POA::_duplicate(poa); _contId = contId ; CORBA::Object_var o = _poa->id_to_reference(*contId); // container ior... _container=Engines::Container::_narrow(o); - const CORBA::String_var ior = _orb->object_to_string(o); - _myConnexionToRegistry = new RegistryConnexion(0, 0, ior,"theSession", - _instanceName.c_str()); + + if(regist) + { + const CORBA::String_var ior = _orb->object_to_string(o); + _myConnexionToRegistry = new RegistryConnexion(0, 0, ior,"theSession", + _instanceName.c_str()); + } _notifSupplier = new NOTIFICATION_Supplier(instanceName, notif); - //SCRUTE(pd_refCount); } //============================================================================= @@ -172,48 +176,6 @@ Engines_Component_i::Engines_Component_i(CORBA::ORB_ptr orb, } -//============================================================================= -/*! \brief Standard constructor for parallel component - * - * Connection Notification (no connection to Registry !) - * \param orb Object Request broker given by Container - * \param poa Portable Object Adapter from Container (normally root_poa) - * \param contId container CORBA id inside the server - * \param instanceName unique instance name for this object (see Container_i) - * \param interfaceName component class name - * \param flag not used... - * \param notif use of notification - */ -//============================================================================= - -Engines_Component_i::Engines_Component_i(CORBA::ORB_ptr orb, - PortableServer::POA_ptr poa, - PortableServer::ObjectId * contId, - const char *instanceName, - const char *interfaceName, - int flag, - bool notif ) : - _instanceName(instanceName), - _interfaceName(interfaceName), - _myConnexionToRegistry(0), - _notifSupplier(0), - _ThreadId(0) , - _ThreadCpuUsed(0) , - _Executed(false) , - _graphName("") , - _nodeName(""), - _studyId(-1), - _CanceledThread(false) -{ - _orb = CORBA::ORB::_duplicate(orb); - _poa = PortableServer::POA::_duplicate(poa); - _contId = contId ; - CORBA::Object_var o = _poa->id_to_reference(*contId); // container ior... - _container=Engines::Container::_narrow(o); - - _notifSupplier = new NOTIFICATION_Supplier(instanceName, notif); -} - //============================================================================= /*! * Destructor: call Container for decrement of instances count. diff --git a/src/Container/Container_i.cxx b/src/Container/Container_i.cxx index 28c58e407..7558ef37e 100644 --- a/src/Container/Container_i.cxx +++ b/src/Container/Container_i.cxx @@ -96,7 +96,7 @@ omni_mutex Engines_Container_i::_numInstanceMutex ; //============================================================================= Engines_Container_i::Engines_Container_i () : -_numInstance(0) +_numInstance(0),_id(0),_NS(0) { } @@ -113,7 +113,7 @@ Engines_Container_i::Engines_Container_i (CORBA::ORB_ptr orb, bool activAndRegist, bool isServantAloneInProcess ) : -_numInstance(0),_isServantAloneInProcess(isServantAloneInProcess) + _numInstance(0),_isServantAloneInProcess(isServantAloneInProcess),_id(0),_NS(0) { _pid = (long)getpid(); @@ -229,7 +229,8 @@ _numInstance(0),_isServantAloneInProcess(isServantAloneInProcess) Engines_Container_i::~Engines_Container_i() { MESSAGE("Container_i::~Container_i()"); - delete _id; + if(_id) + delete _id; if(_NS) delete _NS; } diff --git a/src/Container/SALOME_Component_i.hxx b/src/Container/SALOME_Component_i.hxx index ccf8ddff2..361b87eb3 100644 --- a/src/Container/SALOME_Component_i.hxx +++ b/src/Container/SALOME_Component_i.hxx @@ -59,7 +59,8 @@ public: PortableServer::ObjectId * contId, const char *instanceName, const char *interfaceName, - bool notif = false); + bool notif = false, + bool regist = true); //Constructor for standalone component Engines_Component_i(CORBA::ORB_ptr orb, PortableServer::POA_ptr poa, @@ -67,15 +68,7 @@ public: const char *instanceName, const char *interfaceName, bool notif = false, - bool regist=true); - // Constructor for parallel component : don't call registry - Engines_Component_i(CORBA::ORB_ptr orb, - PortableServer::POA_ptr poa, - PortableServer::ObjectId * contId, - const char *instanceName, - const char *interfaceName, - int flag, - bool notif = false); + bool regist = true); virtual ~Engines_Component_i(); @@ -100,23 +93,23 @@ public: bool Resume_impl(); CORBA::Long CpuUsed_impl() ; - virtual Engines::TMPFile* DumpPython(CORBA::Object_ptr theStudy, - CORBA::Boolean isPublished, - CORBA::Boolean& isValidScript); + virtual Engines::TMPFile* DumpPython(CORBA::Object_ptr theStudy, + CORBA::Boolean isPublished, + CORBA::Boolean& isValidScript); - // CORBA operations for Salome_file - virtual Engines::Salome_file_ptr getInputFileToService(const char* service_name, - const char* Salome_file_name); - virtual Engines::Salome_file_ptr getOutputFileToService(const char* service_name, - const char* Salome_file_name); + // CORBA operations for Salome_file + virtual Engines::Salome_file_ptr getInputFileToService(const char* service_name, + const char* Salome_file_name); + virtual Engines::Salome_file_ptr getOutputFileToService(const char* service_name, + const char* Salome_file_name); - virtual void checkInputFilesToService(const char* service_name); - virtual Engines::Salome_file_ptr setInputFileToService(const char* service_name, - const char* Salome_file_name); + virtual void checkInputFilesToService(const char* service_name); + virtual Engines::Salome_file_ptr setInputFileToService(const char* service_name, + const char* Salome_file_name); - virtual void checkOutputFilesToService(const char* service_name); - virtual Engines::Salome_file_ptr setOutputFileToService(const char* service_name, - const char* Salome_file_name); + virtual void checkOutputFilesToService(const char* service_name); + virtual Engines::Salome_file_ptr setOutputFileToService(const char* service_name, + const char* Salome_file_name); // Object information virtual bool hasObjectInfo() { return false; } @@ -143,8 +136,8 @@ public: void CancelThread() ; virtual void configureSalome_file(std::string service_name, - std::string file_port_name, - Salome_file_i * file); + std::string file_port_name, + Salome_file_i * file); protected: diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index c1bf5dc17..d00618fd2 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -62,7 +62,6 @@ SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableSer MESSAGE("constructor"); _NS = ns; _ResManager = rm; - _id=0; PortableServer::POAManager_var pman = poa->the_POAManager(); _orb = CORBA::ORB::_duplicate(orb) ; @@ -177,7 +176,7 @@ void SALOME_ContainerManager::ShutdownContainers() } //============================================================================= - //! Find a suitable Container in a list of machines, or start one +//! Find a suitable Container in a list of machines, or start one /*! CORBA Method: * \param params Machine Parameters required for the container * \param possibleComputers list of machines usable for find or start @@ -220,9 +219,7 @@ StartContainer(const Engines::MachineParameters& params, if (parallelLib != "") return FindOrStartParallelContainer(params, possibleComputers); #endif - long id; string containerNameInNS; - char idc[3*sizeof(long)]; Engines::Container_ptr ret = Engines::Container::_nil(); MESSAGE("SALOME_ContainerManager::StartContainer " << @@ -257,15 +254,6 @@ StartContainer(const Engines::MachineParameters& params, MESSAGE("try to launch it on " << theMachine); - // Get Id for container: a parallel container registers in Naming Service - // on the machine where is process 0. ContainerManager does'nt know the name - // of this machine before the launch of the parallel container. So to get - // the IOR of the parallel container in Naming Service, ContainerManager - // gives a unique Id. The parallel container registers his name under - // /ContainerManager/Id directory in NamingService - - id = GetIdForContainer(); - string command; if(theMachine==""){ MESSAGE("SALOME_ContainerManager::StartContainer : " << @@ -273,17 +261,14 @@ StartContainer(const Engines::MachineParameters& params, return Engines::Container::_nil(); } else if(theMachine==Kernel_Utils::GetHostname()) - command = BuildCommandToLaunchLocalContainer(params,id,container_exe); + command = BuildCommandToLaunchLocalContainer(params,container_exe); else - command = BuildCommandToLaunchRemoteContainer(theMachine,params,id,container_exe); + command = BuildCommandToLaunchRemoteContainer(theMachine,params,container_exe); //check if an entry exists in Naming service if(params.isMPI) - { - containerNameInNS = "/ContainerManager/id"; - sprintf(idc,"%ld",id); - containerNameInNS += idc; - } + // A parallel container register on zero node in NS + containerNameInNS = _NS->BuildContainerNameForNS(params,GetMPIZeroNode(theMachine).c_str()); else containerNameInNS = _NS->BuildContainerNameForNS(params,theMachine.c_str()); @@ -315,13 +300,13 @@ StartContainer(const Engines::MachineParameters& params, if (status == -1){ MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed " << "(system command status -1)"); - RmTmpFile(); // command file can be removed here + RmTmpFile(_TmpFileName); // command file can be removed here return Engines::Container::_nil(); } else if (status == 217){ MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed " << "(system command status 217)"); - RmTmpFile(); // command file can be removed here + RmTmpFile(_TmpFileName); // command file can be removed here return Engines::Container::_nil(); } else{ @@ -353,7 +338,7 @@ StartContainer(const Engines::MachineParameters& params, ret->logfilename(logFilename.c_str()); } - RmTmpFile(); // command file can be removed here + RmTmpFile(_TmpFileName); // command file can be removed here return ret; } } @@ -518,33 +503,33 @@ FindOrStartParallelContainer(const Engines::MachineParameters& params_const, proxy->start(); } catch(CORBA::SystemException& e) - { - INFOS("Caught CORBA::SystemException. : " << e); - } + { + INFOS("Caught CORBA::SystemException. : " << e); + } catch(PortableServer::POA::ServantAlreadyActive&) - { - INFOS("Caught CORBA::ServantAlreadyActiveException"); - } + { + INFOS("Caught CORBA::ServantAlreadyActiveException"); + } catch(CORBA::Exception&) - { - INFOS("Caught CORBA::Exception."); - } + { + INFOS("Caught CORBA::Exception."); + } catch(std::exception& exc) - { - INFOS("Caught std::exception - "<ContainerName(params); - command += " -id "; - sprintf(idc,"%ld",id); - command += idc; command += " -"; AddOmninamesParams(command); @@ -917,12 +887,11 @@ SALOME_ContainerManager::BuildCommandToLaunchRemoteContainer string SALOME_ContainerManager::BuildCommandToLaunchLocalContainer -(const Engines::MachineParameters& params, const long id,const std::string& container_exe) +(const Engines::MachineParameters& params, const std::string& container_exe) { _TmpFileName = BuildTemporaryFileName(); string command; int nbproc = 0; - //char idc[3*sizeof(long)]; ofstream command_file( _TmpFileName.c_str() ); @@ -949,14 +918,23 @@ SALOME_ContainerManager::BuildCommandToLaunchLocalContainer #ifdef WITHLAM //command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; command_file << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#elif defined(WITHOPENMPI) + //command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace "; + if( getenv("OMPI_URI_FILE") == NULL ) + command_file << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace"; + else + { + command_file << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:"; + command_file << getenv("OMPI_URI_FILE"); + } #endif if (isPythonContainer(params.container_name)) //command += "pyMPI SALOME_ContainerPy.py "; - command_file << "pyMPI SALOME_ContainerPy.py "; + command_file << " pyMPI SALOME_ContainerPy.py "; else //command += "SALOME_MPIContainer "; - command_file << "SALOME_MPIContainer "; + command_file << " SALOME_MPIContainer "; } else @@ -1001,16 +979,8 @@ SALOME_ContainerManager::BuildCommandToLaunchLocalContainer } - - /*command += _NS->ContainerName(params); - command += " -id "; - sprintf(idc,"%ld",id); - command += idc; - command += " -"; - AddOmninamesParams(command);*/ - command_file << _NS->ContainerName(params); - command_file << " -id " << id << " -"; + command_file << " -"; AddOmninamesParams(command_file); command_file.close(); @@ -1030,9 +1000,9 @@ SALOME_ContainerManager::BuildCommandToLaunchLocalContainer */ //============================================================================= -void SALOME_ContainerManager::RmTmpFile() +void SALOME_ContainerManager::RmTmpFile(std::string& tmpFileName) { - int lenght = _TmpFileName.size(); + int lenght = tmpFileName.size(); if ( lenght > 0) { #ifdef WIN32 @@ -1041,13 +1011,13 @@ void SALOME_ContainerManager::RmTmpFile() string command = "rm "; #endif if ( lenght > 4 ) - command += _TmpFileName.substr(0, lenght - 3 ); + command += tmpFileName.substr(0, lenght - 3 ); else - command += _TmpFileName; + command += tmpFileName; command += '*'; system(command.c_str()); //if dir is empty - remove it - string tmp_dir = Kernel_Utils::GetDirByPath( _TmpFileName ); + string tmp_dir = Kernel_Utils::GetDirByPath( tmpFileName ); if ( Kernel_Utils::IsEmptyDir( tmp_dir ) ) { #ifdef WIN32 @@ -1067,11 +1037,11 @@ void SALOME_ContainerManager::RmTmpFile() //============================================================================= void SALOME_ContainerManager::AddOmninamesParams(string& command) const - { - CORBA::String_var iorstr = _NS->getIORaddr(); - command += "ORBInitRef NameService="; - command += iorstr; - } +{ + CORBA::String_var iorstr = _NS->getIORaddr(); + command += "ORBInitRef NameService="; + command += iorstr; +} //============================================================================= @@ -1081,11 +1051,11 @@ void SALOME_ContainerManager::AddOmninamesParams(string& command) const //============================================================================= void SALOME_ContainerManager::AddOmninamesParams(ofstream& fileStream) const - { - CORBA::String_var iorstr = _NS->getIORaddr(); - fileStream << "ORBInitRef NameService="; - fileStream << iorstr; - } +{ + CORBA::String_var iorstr = _NS->getIORaddr(); + fileStream << "ORBInitRef NameService="; + fileStream << iorstr; +} //============================================================================= /*! @@ -1094,16 +1064,16 @@ void SALOME_ContainerManager::AddOmninamesParams(ofstream& fileStream) const //============================================================================= string SALOME_ContainerManager::BuildTemporaryFileName() const - { - //build more complex file name to support multiple salome session - string aFileName = Kernel_Utils::GetTmpFileName(); +{ + //build more complex file name to support multiple salome session + string aFileName = Kernel_Utils::GetTmpFileName(); #ifndef WIN32 - aFileName += ".sh"; + aFileName += ".sh"; #else - aFileName += ".bat"; + aFileName += ".bat"; #endif - return aFileName; - } + return aFileName; +} //============================================================================= @@ -1155,6 +1125,13 @@ SALOME_ContainerManager::BuildTempFileToLaunchRemoteContainer tempOutputFile << nbproc << " "; #ifdef WITHLAM tempOutputFile << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#elif defined(WITHOPENMPI) + if( getenv("OMPI_URI_FILE") == NULL ) + tempOutputFile << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace"; + else{ + tempOutputFile << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:"; + tempOutputFile << getenv("OMPI_URI_FILE"); + } #endif } @@ -1163,9 +1140,9 @@ SALOME_ContainerManager::BuildTempFileToLaunchRemoteContainer if (params.isMPI) { if (isPythonContainer(params.container_name)) - tempOutputFile << "pyMPI SALOME_ContainerPy.py "; + tempOutputFile << " pyMPI SALOME_ContainerPy.py "; else - tempOutputFile << "SALOME_MPIContainer "; + tempOutputFile << " SALOME_MPIContainer "; } else @@ -1264,90 +1241,90 @@ SALOME_ContainerManager::BuildCommandToLaunchLocalParallelContainer(const std::s string real_exe_name = exe_name + parallelLib; if (parallelLib == "Dummy") - { - //command = "gdb --args "; - //command = "valgrind --tool=memcheck --log-file=val_log "; - //command += real_exe_name; - - command = real_exe_name; - - command += " " + _NS->ContainerName(rtn); - command += " " + parallelLib; - command += " " + hostname; - command += " -"; - AddOmninamesParams(command); - } - - else if (parallelLib == "Mpi") - { - // Step 1 : check if MPI is started - if (_MpiStarted == false) { - startMPI(); - } + //command = "gdb --args "; + //command = "valgrind --tool=memcheck --log-file=val_log "; + //command += real_exe_name; - if (par < 0) - { - // Nodes case + command = real_exe_name; - command = "mpiexec -np " + string(buffer) + " "; -// command += "gdb --args "; - command += real_exe_name; command += " " + _NS->ContainerName(rtn); command += " " + parallelLib; command += " " + hostname; command += " -"; AddOmninamesParams(command); } - else + + else if (parallelLib == "Mpi") { - // Proxy case - command = "mpiexec -np 1 "; - command += real_exe_name; - command += " " + _NS->ContainerName(rtn); - command += " " + string(buffer); - command += " " + parallelLib; - command += " " + hostname; - command += " -"; - AddOmninamesParams(command); + // Step 1 : check if MPI is started + if (_MpiStarted == false) + { + startMPI(); + } + + if (par < 0) + { + // Nodes case + + command = "mpiexec -np " + string(buffer) + " "; + // command += "gdb --args "; + command += real_exe_name; + command += " " + _NS->ContainerName(rtn); + command += " " + parallelLib; + command += " " + hostname; + command += " -"; + AddOmninamesParams(command); + } + else + { + // Proxy case + command = "mpiexec -np 1 "; + command += real_exe_name; + command += " " + _NS->ContainerName(rtn); + command += " " + string(buffer); + command += " " + parallelLib; + command += " " + hostname; + command += " -"; + AddOmninamesParams(command); + } } - } else - { - std::string message("Unknown parallelLib" + parallelLib); - throw SALOME_Exception(message.c_str()); - } + { + std::string message("Unknown parallelLib" + parallelLib); + throw SALOME_Exception(message.c_str()); + } // log choice if (log == "default") - { - command += " > /tmp/"; - command += _NS->ContainerName(rtn); - command += "_"; - command += Kernel_Utils::GetHostname(); - command += "_"; - command += getenv( "USER" ) ; - command += ".log 2>&1 &" ; - } + { + command += " > /tmp/"; + command += _NS->ContainerName(rtn); + command += "_"; + command += Kernel_Utils::GetHostname(); + command += "_"; + command += getenv( "USER" ) ; + command += ".log 2>&1 &" ; + } if (log == "xterm") - { - command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; " - + command + " \" &"; -// + command + "; echo $LD_LIBRARY_PATH; cat \" &"; - } + { + command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; " + + command + " \" &"; + // + command + "; echo $LD_LIBRARY_PATH; cat \" &"; + } return command; -/* if (log == "xterm") - { - command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; echo $LD_LIBRARY_PATH; echo $PATH; " + command + "; cat \" &"; - } -*/ -/* command = "cd ; rm " + fichier_commande + "; touch " + \ - fichier_commande + "; echo \" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; " + \ - command + " >& /tmp/ribes_" + fichier_commande + " & \" > " + fichier_commande + ";"; - command += "ssh cn01 sh " + fichier_commande + " &"; - cerr << "La commande : " << command << endl; -*/ + /* if (log == "xterm") + { + command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; echo $LD_LIBRARY_PATH; echo $PATH; " + command + "; cat \" &"; + } + */ + /* command = "cd ; rm " + fichier_commande + "; touch " + \ + fichier_commande + "; echo \" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; " + \ + command + " >& /tmp/ribes_" + fichier_commande + " & \" > " + fichier_commande + ";"; + command += "ssh cn01 sh " + fichier_commande + " &"; + cerr << "La commande : " << command << endl; + */ } void SALOME_ContainerManager::startMPI() @@ -1362,16 +1339,35 @@ void SALOME_ContainerManager::startMPI() int status = system("lamboot"); if (status == -1) - { - INFOS("lamboot failed : system command status -1"); - } + { + INFOS("lamboot failed : system command status -1"); + } else if (status == 217) - { - INFOS("lamboot failed : system command status 217"); - } + { + INFOS("lamboot failed : system command status 217"); + } else - { - _MpiStarted = true; - } + { + _MpiStarted = true; + } } +string SALOME_ContainerManager::GetMPIZeroNode(string machine) +{ + int status; + string zeronode; + string cmd; + string tmpFile = BuildTemporaryFileName(); + + cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile; + + status = system(cmd.c_str()); + if( status == 0 ){ + ifstream fp(tmpFile.c_str(),ios::in); + fp >> zeronode; + } + + RmTmpFile(tmpFile); + + return zeronode; +} diff --git a/src/Container/SALOME_ContainerManager.hxx b/src/Container/SALOME_ContainerManager.hxx index 7b1f26eab..d57dfba1d 100644 --- a/src/Container/SALOME_ContainerManager.hxx +++ b/src/Container/SALOME_ContainerManager.hxx @@ -50,12 +50,12 @@ public: StartContainer(const Engines::MachineParameters& params, const Engines::MachineList& possibleComputer, Engines::ResPolicy policy, - const std::string& container_exe="SALOME_Container"); + const std::string& container_exe="SALOME_Container"); Engines::Container_ptr StartContainer(const Engines::MachineParameters& params, - Engines::ResPolicy policy, - const Engines::CompoList& componentList); + Engines::ResPolicy policy, + const Engines::CompoList& componentList); Engines::Container_ptr GiveContainer(const Engines::MachineParameters& params, @@ -88,19 +88,17 @@ protected: void fillBatchLaunchedContainers(); - long GetIdForContainer(void); - std::string BuildCommandToLaunchRemoteContainer(const std::string& machine, - const Engines::MachineParameters& params, const long id, - const std::string& container_exe="SALOME_Container"); + const Engines::MachineParameters& params, + const std::string& container_exe="SALOME_Container"); - std::string BuildCommandToLaunchLocalContainer(const Engines::MachineParameters& params, const long id, + std::string BuildCommandToLaunchLocalContainer(const Engines::MachineParameters& params, const std::string& container_exe="SALOME_Container"); std::string BuildTempFileToLaunchRemoteContainer(const std::string& machine, const Engines::MachineParameters& params) throw(SALOME_Exception); - void RmTmpFile(); + void RmTmpFile(std::string& tmpFile); void AddOmninamesParams(std::string& command) const; @@ -108,6 +106,8 @@ protected: std::string BuildTemporaryFileName() const; + std::string GetMPIZeroNode(std::string machine); + // Parallel extension std::string BuildCommandToLaunchLocalParallelContainer(const std::string& exe_name, const Engines::MachineParameters& params, @@ -115,7 +115,6 @@ protected: void startMPI(); bool _MpiStarted; - long _id; CORBA::ORB_var _orb; PortableServer::POA_var _poa; diff --git a/src/MPIContainer/MPIContainer_i.cxx b/src/MPIContainer/MPIContainer_i.cxx index ecdfba30c..c3fd23405 100644 --- a/src/MPIContainer/MPIContainer_i.cxx +++ b/src/MPIContainer/MPIContainer_i.cxx @@ -48,52 +48,22 @@ Engines_MPIContainer_i::Engines_MPIContainer_i(int nbproc, int numproc, int argc, char *argv[]) : Engines_Container_i(orb,poa,containerName,argc,argv,false), MPIObject_i(nbproc,numproc) { - long id=0; - string IdContainerinNS; - char idc[3*sizeof(long)]; - MESSAGE("[" << numproc << "] activate object"); _id = _poa->activate_object(this); - - if(argc>1) - { - for(int i=0;iid_to_reference(*_id); + Engines::Container_var pCont = Engines::Container::_narrow(obj); + _remove_ref(); if(numproc==0){ _NS = new SALOME_NamingService(); _NS->init_orb( CORBA::ORB::_duplicate(_orb) ) ; - CORBA::Object_var obj=_poa->id_to_reference(*_id); - Engines::Container_var pCont = Engines::Container::_narrow(obj); - string hostname = Kernel_Utils::GetHostname(); _containerName = _NS->BuildContainerNameForNS(containerName,hostname.c_str()); SCRUTE(_containerName); _NS->Register(pCont, _containerName.c_str()); - // A parallel container registers in Naming Service - // on the machine where is process 0. ContainerManager does'nt know the name - // of this machine before the launch of the parallel container. So to get - // the IOR of the parallel container in Naming Service, ContainerManager - // gives a unique Id. The parallel container registers his name under - // /ContainerManager/Id directory in NamingService - - IdContainerinNS = "/ContainerManager/id"; - sprintf(idc,"%ld",id); - IdContainerinNS += idc; - SCRUTE(IdContainerinNS); - _NS->Register(pCont, IdContainerinNS.c_str()); - } // Root recupere les ior des container des autre process @@ -121,6 +91,24 @@ void Engines_MPIContainer_i::Shutdown() for(ip= 1;ip<_nbproc;ip++) (Engines::MPIContainer::_narrow((*_tior)[ip]))->Shutdown(); } + + std::map::iterator itm; + for (itm = _listInstances_map.begin(); itm != _listInstances_map.end(); itm++) + { + try + { + itm->second->destroy(); + } + catch(const CORBA::Exception& e) + { + // ignore this entry and continue + } + catch(...) + { + // ignore this entry and continue + } + } + _orb->shutdown(0); } @@ -149,7 +137,6 @@ bool Engines_MPIContainer_i::Lload_component_Library(const char* componentName) // --- try dlopen C++ component string impl_name = string ("lib") + aCompName + string("Engine.so"); - SCRUTE(impl_name); _numInstanceMutex.lock(); // lock to be alone // (see decInstanceCnt, finalize_removal)) @@ -167,12 +154,15 @@ bool Engines_MPIContainer_i::Lload_component_Library(const char* componentName) { _library_map[impl_name] = handle; _numInstanceMutex.unlock(); + MESSAGE("[" << _numproc << "] Library " << impl_name << " loaded"); + MPI_Barrier(MPI_COMM_WORLD); return true; } else { - INFOS("[" << _numproc << "] Can't load shared library : " << impl_name); - INFOS("[" << _numproc << "] error dlopen: " << dlerror()); + MESSAGE("[" << _numproc << "] Can't load shared library : " << impl_name); + MESSAGE("[" << _numproc << "] error dlopen: " << dlerror()); + MPI_Barrier(MPI_COMM_WORLD); } _numInstanceMutex.unlock(); @@ -287,17 +277,15 @@ Engines_MPIContainer_i::Lcreate_component_instance( const char* genericRegisterN //--- try C++ string impl_name = string ("lib") + genericRegisterName +string("Engine.so"); - void* handle = _library_map[impl_name]; - if ( !handle ) { - INFOS("shared library " << impl_name <<"must be loaded before instance"); - return Engines::Component::_nil() ; - } - else { - iobject = createMPIInstance(genericRegisterName, - handle, - studyId); - return iobject._retn(); - } + if (_library_map.count(impl_name) != 0) // C++ component + { + void* handle = _library_map[impl_name]; + iobject = createMPIInstance(genericRegisterName, + handle, + studyId); + return iobject._retn(); + } + } Engines::Component_ptr @@ -311,7 +299,6 @@ Engines_MPIContainer_i::createMPIInstance(string genericRegisterName, string aGenRegisterName = genericRegisterName; string factory_name = aGenRegisterName + string("Engine_factory"); - SCRUTE(factory_name) ; typedef PortableServer::ObjectId * (*MPIFACTORY_FUNCTION) (int,int, @@ -321,17 +308,17 @@ Engines_MPIContainer_i::createMPIInstance(string genericRegisterName, const char *, const char *) ; - MPIFACTORY_FUNCTION MPIComponent_factory - = (MPIFACTORY_FUNCTION) dlsym(handle, factory_name.c_str()); + dlerror(); + MPIFACTORY_FUNCTION MPIComponent_factory = (MPIFACTORY_FUNCTION) dlsym(handle, factory_name.c_str()); - char *error ; - if ( (error = dlerror() ) != NULL) { - // Try to load a sequential component - MESSAGE("[" << _numproc << "] Try to load a sequential component"); - _numInstanceMutex.unlock() ; - iobject = Engines_Container_i::createInstance(genericRegisterName,handle,studyId); - if( CORBA::is_nil(iobject) ) return Engines::Component::_duplicate(iobject); - } + if ( !MPIComponent_factory ) + { + INFOS( "[" << _numproc << "] Can't resolve symbol: " + factory_name ); + SCRUTE( dlerror() ); + pobj = Engines::MPIObject::_nil(); + BCastIOR(_orb,pobj,false); + return Engines::Component::_nil(); + } // --- create instance @@ -370,8 +357,6 @@ Engines_MPIContainer_i::createMPIInstance(string genericRegisterName, //SCRUTE(servant->pd_refCount); _listInstances_map[instanceName] = iobject; _cntInstances_map[aGenRegisterName] += 1; - SCRUTE(aGenRegisterName); - SCRUTE(_cntInstances_map[aGenRegisterName]); //SCRUTE(servant->pd_refCount); bool ret_studyId = servant->setStudyId(studyId); ASSERT(ret_studyId); @@ -387,10 +372,14 @@ Engines_MPIContainer_i::createMPIInstance(string genericRegisterName, BCastIOR(_orb,pobj,false); } - catch (...) - { - INFOS( "Container_i::createInstance exception catched" ) ; - } + catch(const POException &ex){ + INFOS( ex.msg << " on process number " << ex.numproc ) ; + return Engines::Component::_nil(); + } + catch (...){ + INFOS( "Container_i::createInstance exception catched" ) ; + return Engines::Component::_nil(); + } return iobject._retn(); } @@ -450,6 +439,7 @@ Engines::Component_ptr Engines_MPIContainer_i::Lload_impl( string factory_name = _nameToRegister + string("Engine_factory"); MESSAGE("[" << _numproc << "] factory_name=" << factory_name) ; + dlerror(); PortableServer::ObjectId * (*MPIComponent_factory) (int,int, CORBA::ORB_ptr, PortableServer::POA_ptr, diff --git a/src/MPIContainer/MPIObject_i.cxx b/src/MPIContainer/MPIObject_i.cxx index f68a739b6..ff45f5e5e 100644 --- a/src/MPIContainer/MPIObject_i.cxx +++ b/src/MPIContainer/MPIObject_i.cxx @@ -23,10 +23,10 @@ // File : MPIObject_i.cxx // Module : SALOME // -#include #include "MPIObject_i.hxx" #include "utilities.h" using namespace std; +#define TIMEOUT 5 MPIObject_i::MPIObject_i() { @@ -65,7 +65,7 @@ void MPIObject_i::tior(const Engines::IORTab& ior) } void MPIObject_i::BCastIOR(CORBA::ORB_ptr orb, Engines::MPIObject_ptr pobj, - bool amiCont) + bool amiCont) throw(POException) { int err, ip, n; char *ior; @@ -95,13 +95,14 @@ void MPIObject_i::BCastIOR(CORBA::ORB_ptr orb, Engines::MPIObject_ptr pobj, } iort[ip] = orb->string_to_object(ior); delete [] ior; + if(CORBA::is_nil(iort[ip])) + throw POException(ip,"MPI Component not loaded"); } // On donne le tableau des ior a l'objet Corba du process 0 if( amiCont ) tior(*(iort._retn())); else pobj->tior(*(iort._retn())); - } else{ // Conversion IOR vers string @@ -123,3 +124,78 @@ void MPIObject_i::BCastIOR(CORBA::ORB_ptr orb, Engines::MPIObject_ptr pobj, } +#ifdef HAVE_MPI2 +MPI_Comm MPIObject_i::remoteMPI2Connect(string service) throw(POException) +{ + int i; + MPI_Comm gcom; + char port_name_clt[MPI_MAX_PORT_NAME]; + + _srv = 0; + _service = service; + + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN); + if( _numproc == 0 ){ + /* rank 0 try to be a server. If service is already published, try to be a cient */ + MPI_Open_port(MPI_INFO_NULL, _port_name); + if ( MPI_Publish_name((char*)_service.c_str(), MPI_INFO_NULL, _port_name) == MPI_SUCCESS ) { + _srv = 1; + MESSAGE("[" << _numproc << "] service " << _service << " available at " << _port_name << "\n"); + } + else if ( MPI_Lookup_name((char*)_service.c_str(), MPI_INFO_NULL, port_name_clt) == MPI_SUCCESS ){ + MESSAGE("[" << _numproc << "] I get the connection with " << _service << " at " << port_name_clt << "!\n"); + MPI_Close_port( _port_name ); + } + else{ + /* Throw exception */ + MESSAGE("[" << _numproc << "] Error on connection with " << _service << " at " << port_name_clt << "!\n"); + throw POException(_numproc,"Error on connection with " + _service + " at " + port_name_clt); + } + } + else{ + i=0; + /* Waiting rank 0 publish name and try to be a client */ + while ( i != TIMEOUT ) { + sleep(1); + if ( MPI_Lookup_name((char*)_service.c_str(), MPI_INFO_NULL, port_name_clt) == MPI_SUCCESS ){ + MESSAGE("[" << _numproc << "] I get the connection with " << _service << " at " << port_name_clt << "!\n"); + break; + } + i++; + } + if(i==TIMEOUT){ + /* Throw exception */ + MESSAGE("[" << _numproc << "] Error on connection with " << _service << " at " << port_name_clt << "!\n"); + throw POException(_numproc,"Error on connection with " + _service + " at " + port_name_clt); + } + } + MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_ARE_FATAL); + + /* If rank 0 is server, all processes call MPI_Comm_accept */ + /* If rank 0 is not server, all processes call MPI_Comm_connect */ + MPI_Bcast(&_srv,1,MPI_INT,0,MPI_COMM_WORLD); + if ( _srv ) + MPI_Comm_accept( _port_name, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &gcom ); + else + MPI_Comm_connect(port_name_clt, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &gcom ); + + /* only rank 0 can be server for unpublish name */ + if(_numproc != 0) _srv = 0; + + return gcom; + +} + +void MPIObject_i::remoteMPI2Disconnect(MPI_Comm gcom) +{ + MPI_Comm_disconnect( &gcom ); + if ( _srv ) { + MPI_Unpublish_name((char*)_service.c_str(), MPI_INFO_NULL, _port_name); + MESSAGE("[" << _numproc << "] " << _service << ": close port " << _port_name << "\n"); + MPI_Close_port( _port_name ); + } +} +#endif + diff --git a/src/MPIContainer/MPIObject_i.hxx b/src/MPIContainer/MPIObject_i.hxx index 21ea9d84c..641cb1138 100644 --- a/src/MPIContainer/MPIObject_i.hxx +++ b/src/MPIContainer/MPIObject_i.hxx @@ -26,8 +26,19 @@ #ifndef _SALOME_POBJECT_I_H_ #define _SALOME_POBJECT_I_H_ +#include +#include #include #include CORBA_SERVER_HEADER(SALOME_MPIObject) +#define defaultService "SERVER" + +class POException +{ +public: + const std::string msg; + const int numproc; + POException(const int ip,const std::string m) : numproc(ip),msg(m) {} +}; class MPIObject_i: public POA_Engines::MPIObject { @@ -47,7 +58,19 @@ class MPIObject_i: public POA_Engines::MPIObject // IOR des objets paralleles sur tous les process mpi Engines::IORTab* _tior; // Echange des IOR de l'objet entre process - void BCastIOR(CORBA::ORB_ptr orb,Engines::MPIObject_ptr pobj,bool amiCont); + void BCastIOR(CORBA::ORB_ptr orb,Engines::MPIObject_ptr pobj,bool amiCont) throw(POException); +#ifdef HAVE_MPI2 + // MPI2 connection + MPI_Comm remoteMPI2Connect(std::string service=defaultService) throw(POException); + // MPI2 disconnection + void remoteMPI2Disconnect(MPI_Comm gcom); +#endif + +private: + int _srv; + char _port_name[MPI_MAX_PORT_NAME]; + std::string _service; + } ; #endif diff --git a/src/MPIContainer/SALOME_MPIContainer.cxx b/src/MPIContainer/SALOME_MPIContainer.cxx index 647e54470..c288ec8cf 100644 --- a/src/MPIContainer/SALOME_MPIContainer.cxx +++ b/src/MPIContainer/SALOME_MPIContainer.cxx @@ -83,7 +83,7 @@ int main(int argc, char* argv[]) } MESSAGE("[" << numproc << "] MPIContainer: load MPIContainer servant"); - myContainer = new Engines_MPIContainer_i(nbproc,numproc,orb,factory_poa, containerName,argc,argv); + new Engines_MPIContainer_i(nbproc,numproc,orb,factory_poa, containerName,argc,argv); pman->activate(); @@ -106,13 +106,11 @@ int main(int argc, char* argv[]) INFOS("Caught unknown exception."); } - if(myContainer) - delete myContainer; + MPI_Finalize(); END_OF("[" << numproc << "] " << argv[0]); - // delete myThreadTrace; - MPI_Finalize(); + exit(0); } -- 2.39.2