From 16a97a4234b4a8f8737e5985587a6e65b070d30c Mon Sep 17 00:00:00 2001 From: prascle Date: Wed, 30 Nov 2005 19:59:11 +0000 Subject: [PATCH] PR: mpi containers portability --- salome_adm/unix/config_files/check_lam.m4 | 46 +++++++++++++++--- src/Communication/ReceiverFactory.cxx | 3 ++ src/Communication/Receivers.hxx | 4 +- src/Communication/SALOME_Comm_i.cxx | 2 +- src/Communication/SALOME_Comm_i.hxx | 6 +-- src/Communication/SenderFactory.cxx | 2 +- src/Communication_SWIG/libSALOME_Comm.i | 3 ++ src/Container/SALOME_Container.cxx | 8 ++-- src/Container/SALOME_ContainerManager.cxx | 48 +++++++++++++++++-- src/Container/SALOME_ContainerManager.hxx | 3 ++ src/MPIContainer/MPIContainer_i.cxx | 37 ++++++++++++-- src/MPIContainer/MPIObject_i.cxx | 2 +- src/MPIContainer/SALOME_MPIContainer.cxx | 2 +- src/NamingService/SALOME_NamingService.cxx | 28 +++++++---- .../SALOME_ResourcesManager.cxx | 47 +++++++++++++++--- .../SALOME_ResourcesManager.hxx | 4 +- 16 files changed, 200 insertions(+), 45 deletions(-) diff --git a/salome_adm/unix/config_files/check_lam.m4 b/salome_adm/unix/config_files/check_lam.m4 index bde79d7e6..ae21b56fe 100644 --- a/salome_adm/unix/config_files/check_lam.m4 +++ b/salome_adm/unix/config_files/check_lam.m4 @@ -59,17 +59,51 @@ if test "$WITHLAM" = yes; then LIBS_old="$LIBS" LDFLAGS_old="$LDFLAGS" LDFLAGS="$MPI_LIBS $LDFLAGS" - AC_CHECK_LIB(lam,lam_mp_init,,WITHLAM="no") - AC_CHECK_LIB(mpi,MPI_Init,WITHLAM="yes",WITHLAM="no") - AC_CHECK_LIB(mpi,MPI_Publish_name,WITHMPI2="yes",WITHMPI2="no") - LDFLAGS="$LDFLAGS_old" - LIBS="$LIBS_old" + fi + + if test "$WITHLAM" = "yes";then + WITHLAM="no" + + if test "$WITHLAM" = "no";then + CPPFLAGS="$MPI_INCLUDES $CPPFLAGS" + LIBS="$LIBS -lmpi++" + AC_TRY_LINK([ + #include + ], [int argc=0; char **argv=0; MPI_Init(&argc,&argv);], + WITHLAM="yes",WITHLAM="no") + if test "$WITHLAM" = "yes";then + MPI_LIBS="$MPI_LIBS -lmpi++" + fi + LIBS="$LIBS_old" + CPPFLAGS="$CPPFLAGS_old" + + 
AC_CHECK_LIB(mpi++,MPI_Publish_name,WITHMPI2="yes",WITHMPI2="no") + LDFLAGS="$LDFLAGS_old" + LIBS="$LIBS_old" + fi + + if test "$WITHLAM" = "no";then + AC_CHECK_LIB(lam,lam_mp_init,WITHLAM="yes",WITHLAM="no") + if test "$WITHLAM" = "yes";then + MPI_LIBS="$MPI_LIBS -llam" + LIBS="$LIBS -llam" + fi + + AC_CHECK_LIB(mpi,MPI_Init,WITHLAM="yes",WITHLAM="no") + if test "$WITHLAM" = "yes";then + MPI_LIBS="$MPI_LIBS -lmpi" + fi + + AC_CHECK_LIB(mpi,MPI_Publish_name,WITHMPI2="yes",WITHMPI2="no") + LDFLAGS="$LDFLAGS_old" + LIBS="$LIBS_old" + fi fi if test "$WITHLAM" = "yes";then WITHMPI="yes" mpi_ok=yes - MPI_LIBS="$MPI_LIBS -llammpi++" + CPPFLAGS="-DWITHLAM $CPPFLAGS" else mpi_ok=no fi diff --git a/src/Communication/ReceiverFactory.cxx b/src/Communication/ReceiverFactory.cxx index 1a2bd9125..94e52b256 100644 --- a/src/Communication/ReceiverFactory.cxx +++ b/src/Communication/ReceiverFactory.cxx @@ -1,3 +1,6 @@ +#ifdef HAVE_MPI2 +#include "mpi.h" +#endif #include "ReceiverFactory.hxx" #include "Receivers.hxx" using namespace std; diff --git a/src/Communication/Receivers.hxx b/src/Communication/Receivers.hxx index c4309b754..c02d9fffc 100644 --- a/src/Communication/Receivers.hxx +++ b/src/Communication/Receivers.hxx @@ -1,11 +1,11 @@ #ifndef _RECEIVERS_HXX_ #define _RECEIVERS_HXX_ -#include "SALOME_Comm_i.hxx" -#include "Receiver.hxx" #ifdef HAVE_MPI2 #include "mpi.h" #endif +#include "SALOME_Comm_i.hxx" +#include "Receiver.hxx" /*! Receiver used for transfert with CORBA when no copy is required remotely and locally. 
diff --git a/src/Communication/SALOME_Comm_i.cxx b/src/Communication/SALOME_Comm_i.cxx index 71fbdadf8..7eb8513f7 100644 --- a/src/Communication/SALOME_Comm_i.cxx +++ b/src/Communication/SALOME_Comm_i.cxx @@ -1,7 +1,7 @@ +#include "SALOME_Comm_i.hxx" #ifndef WNT #include #endif -#include "SALOME_Comm_i.hxx" #include "poa.h" #include "omnithread.h" #include "Utils_SINGLETON.hxx" diff --git a/src/Communication/SALOME_Comm_i.hxx b/src/Communication/SALOME_Comm_i.hxx index f7f196c62..1a8507ab0 100644 --- a/src/Communication/SALOME_Comm_i.hxx +++ b/src/Communication/SALOME_Comm_i.hxx @@ -1,12 +1,12 @@ #ifndef _SALOME_COMM_I_HXX_ #define _SALOME_COMM_I_HXX_ -#include -#include -#include CORBA_SERVER_HEADER(SALOME_Comm) #ifdef HAVE_MPI2 #include "mpi.h" #endif +#include +#include +#include CORBA_SERVER_HEADER(SALOME_Comm) #define TIMEOUT 20 diff --git a/src/Communication/SenderFactory.cxx b/src/Communication/SenderFactory.cxx index 04f8056d0..ef1e956e0 100644 --- a/src/Communication/SenderFactory.cxx +++ b/src/Communication/SenderFactory.cxx @@ -1,7 +1,7 @@ +#include "SALOME_Comm_i.hxx" #include "SenderFactory.hxx" #include "utilities.h" #include "SALOMEMultiComm.hxx" -#include "SALOME_Comm_i.hxx" using namespace std; #ifdef COMP_CORBA_DOUBLE diff --git a/src/Communication_SWIG/libSALOME_Comm.i b/src/Communication_SWIG/libSALOME_Comm.i index 42d4d5a3e..e96cc3eb0 100644 --- a/src/Communication_SWIG/libSALOME_Comm.i +++ b/src/Communication_SWIG/libSALOME_Comm.i @@ -2,6 +2,9 @@ %{ #include "ReceiverFactory.hxx" + #undef SEEK_SET + #undef SEEK_CUR + #undef SEEK_END #include "SALOME_Comm_i.hxx" %} diff --git a/src/Container/SALOME_Container.cxx b/src/Container/SALOME_Container.cxx index f6f1884ff..a97bb39dc 100644 --- a/src/Container/SALOME_Container.cxx +++ b/src/Container/SALOME_Container.cxx @@ -26,6 +26,10 @@ // Module : SALOME // $Header$ +#ifdef HAVE_MPI2 +#include +#endif + #include #include #include @@ -46,10 +50,6 @@ #include #endif -#ifdef HAVE_MPI2 -#include -#endif 
- #include "Container_init_python.hxx" using namespace std; diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index b7a3c708e..0e5e2cde8 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -29,6 +29,7 @@ SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb) MESSAGE("constructor"); _NS = new SALOME_NamingService(orb); _ResManager = new SALOME_ResourcesManager(orb); + _id=0; PortableServer::POA_var root_poa = PortableServer::POA::_the_root_poa(); PortableServer::POAManager_var pman = root_poa->the_POAManager(); PortableServer::POA_var my_poa; @@ -118,6 +119,10 @@ SALOME_ContainerManager:: FindOrStartContainer(const Engines::MachineParameters& params, const Engines::MachineList& possibleComputers) { + long id; + string containerNameInNS; + char idc[sizeof(long)+1]; + Engines::Container_ptr ret = FindContainer(params,possibleComputers); if(!CORBA::is_nil(ret)) return ret; @@ -128,6 +133,15 @@ FindOrStartContainer(const Engines::MachineParameters& params, string theMachine=_ResManager->FindBest(possibleComputers); MESSAGE("try to launch it on " << theMachine); + // Get Id for container: a parallel container registers in Naming Service + // on the machine where process 0 runs. ContainerManager doesn't know the name + // of this machine before the launch of the parallel container. So to get + // the IOR of the parallel container in Naming Service, ContainerManager + // gives a unique Id. 
The parallel container registers its name under + // /ContainerManager/Id directory in NamingService + + id = GetIdForContainer(); + string command; if(theMachine=="") { @@ -137,11 +151,11 @@ FindOrStartContainer(const Engines::MachineParameters& params, } else if(theMachine==GetHostname()) { - command=_ResManager->BuildCommandToLaunchLocalContainer(params); + command=_ResManager->BuildCommandToLaunchLocalContainer(params,id); } else command = - _ResManager->BuildCommandToLaunchRemoteContainer(theMachine,params); + _ResManager->BuildCommandToLaunchRemoteContainer(theMachine,params,id); _ResManager->RmTmpFile(); int status=system(command.c_str()); @@ -170,8 +184,15 @@ FindOrStartContainer(const Engines::MachineParameters& params, count-- ; if ( count != 10 ) MESSAGE( count << ". Waiting for FactoryServer on " << theMachine); - string containerNameInNS = - _NS->BuildContainerNameForNS(params,theMachine.c_str()); + if(params.isMPI) + { + containerNameInNS = "/ContainerManager/id"; + sprintf(idc,"%ld",id); + containerNameInNS += idc; + } + else + containerNameInNS = + _NS->BuildContainerNameForNS(params,theMachine.c_str()); SCRUTE(containerNameInNS); CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str()); ret=Engines::Container::_narrow(obj); @@ -273,3 +294,22 @@ FindContainer(const Engines::MachineParameters& params, MESSAGE("FindContainer: not found"); return Engines::Container::_nil(); } + +//============================================================================= +/*! + * Get Id for container: a parallel container registers in Naming Service + * on the machine where process 0 runs. ContainerManager doesn't know the name + * of this machine before the launch of the parallel container. So to get + * the IOR of the parallel container in Naming Service, ContainerManager + * gives a unique Id. 
The parallel container registers its name under + * /ContainerManager/Id directory in NamingService + */ +//============================================================================= + + +long SALOME_ContainerManager::GetIdForContainer(void) +{ + _id++; + return _id; +} + diff --git a/src/Container/SALOME_ContainerManager.hxx b/src/Container/SALOME_ContainerManager.hxx index f8311e9f7..1217db07b 100644 --- a/src/Container/SALOME_ContainerManager.hxx +++ b/src/Container/SALOME_ContainerManager.hxx @@ -58,6 +58,9 @@ private: FindContainer(const Engines::MachineParameters& params, const char *theMachine); + long GetIdForContainer(void); + long _id; + SALOME_ResourcesManager *_ResManager; SALOME_NamingService *_NS; }; diff --git a/src/MPIContainer/MPIContainer_i.cxx b/src/MPIContainer/MPIContainer_i.cxx index 3b97ad793..8b7309d84 100644 --- a/src/MPIContainer/MPIContainer_i.cxx +++ b/src/MPIContainer/MPIContainer_i.cxx @@ -45,25 +45,52 @@ Engines_MPIContainer_i::Engines_MPIContainer_i(int nbproc, int numproc, int argc, char *argv[]) : Engines_Container_i(orb,poa,containerName,argc,argv,false), MPIObject_i(nbproc,numproc) { + long id=0; + string IdContainerinNS; + char idc[sizeof(long)+1]; + MESSAGE("[" << numproc << "] activate object"); _id = _poa->activate_object(this); -// this->_add_ref(); + + if(argc>1) + { + for(int i=0;i::Instance() ; -// ASSERT(SINGLETON_::IsAlreadyExisting()) ; _NS->init_orb( CORBA::ORB::_duplicate(_orb) ) ; -// Engines::Container_ptr pCont -// = Engines::Container::_narrow(POA_Engines::MPIContainer::_this()); CORBA::Object_var obj=_poa->id_to_reference(*_id); Engines::Container_var pCont = Engines::Container::_narrow(obj); + string hostname = GetHostname(); _containerName = _NS->BuildContainerNameForNS(containerName,hostname.c_str()); SCRUTE(_containerName); _NS->Register(pCont, _containerName.c_str()); + + // A parallel container registers in Naming Service + // on the machine where process 0 runs. 
ContainerManager doesn't know the name + // of this machine before the launch of the parallel container. So to get + // the IOR of the parallel container in Naming Service, ContainerManager + // gives a unique Id. The parallel container registers its name under + // /ContainerManager/Id directory in NamingService + + IdContainerinNS = "/ContainerManager/id"; + sprintf(idc,"%ld",id); + IdContainerinNS += idc; + SCRUTE(IdContainerinNS); + _NS->Register(pCont, IdContainerinNS.c_str()); + } // Root recupere les ior des container des autre process diff --git a/src/MPIContainer/MPIObject_i.cxx b/src/MPIContainer/MPIObject_i.cxx index 0da19e33c..ab6e1a386 100644 --- a/src/MPIContainer/MPIObject_i.cxx +++ b/src/MPIContainer/MPIObject_i.cxx @@ -24,9 +24,9 @@ // File : MPIObject_i.cxx // Module : SALOME +#include #include "MPIObject_i.hxx" #include "utilities.h" -#include using namespace std; MPIObject_i::MPIObject_i() diff --git a/src/MPIContainer/SALOME_MPIContainer.cxx b/src/MPIContainer/SALOME_MPIContainer.cxx index bf2322190..3932ff3d4 100644 --- a/src/MPIContainer/SALOME_MPIContainer.cxx +++ b/src/MPIContainer/SALOME_MPIContainer.cxx @@ -1,9 +1,9 @@ +#include #include #include "MPIContainer_i.hxx" #include "Utils_ORB_INIT.hxx" #include "Utils_SINGLETON.hxx" #include "utilities.h" -#include #include "SALOMETraceCollector.hxx" using namespace std; diff --git a/src/NamingService/SALOME_NamingService.cxx b/src/NamingService/SALOME_NamingService.cxx index 36b0961a1..e0ad5fcd5 100644 --- a/src/NamingService/SALOME_NamingService.cxx +++ b/src/NamingService/SALOME_NamingService.cxx @@ -543,6 +543,16 @@ SALOME_NamingService::ResolveComponent(const char* hostname, for (unsigned int ind = 0; ind < contList.size(); ind++) { name = contList[ind].c_str(); + + if ( nbproc >= 1 ) + { + char *str_nbproc = new char[8]; + sprintf(str_nbproc, "_%d", nbproc); + if( strstr(name.c_str(),str_nbproc) == NULL) + continue; // check only containers with _%d in name + delete [] str_nbproc; + } + 
name += "/"; name += componentName; SCRUTE(name); @@ -1458,15 +1468,17 @@ throw(ServiceUnreachable) void SALOME_NamingService::Destroy_FullDirectory(const char* Path) throw(ServiceUnreachable) { - Change_Directory(Path); - vector contList = list_directory(); - - for (unsigned int ind = 0; ind < contList.size(); ind++) - Destroy_Name(contList[ind].c_str()); - - Destroy_Directory(Path); + if( Change_Directory(Path) ) + { + vector contList = list_directory(); - Destroy_Name(Path); + for (unsigned int ind = 0; ind < contList.size(); ind++) + Destroy_Name(contList[ind].c_str()); + + Destroy_Directory(Path); + + Destroy_Name(Path); + } } // ============================================================================ diff --git a/src/ResourcesManager/SALOME_ResourcesManager.cxx b/src/ResourcesManager/SALOME_ResourcesManager.cxx index 26f9d2de7..8a92adefc 100644 --- a/src/ResourcesManager/SALOME_ResourcesManager.cxx +++ b/src/ResourcesManager/SALOME_ResourcesManager.cxx @@ -357,10 +357,12 @@ bool isPythonContainer(const char* ContainerName) string SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer (const string& machine, - const Engines::MachineParameters& params) + const Engines::MachineParameters& params, const long id) { string command; - + int nbproc; + char idc[sizeof(long)+1]; + if ( ! 
_isAppliSalomeDefined ) command = BuildTempFileToLaunchRemoteContainer(machine, params); @@ -370,8 +372,6 @@ SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer if (params.isMPI) { - int nbproc; - if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) ) nbproc = 1; else if ( params.nb_node == 0 ) @@ -418,9 +418,33 @@ SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer ASSERT(getenv("NSPORT")); command += getenv("NSPORT"); // port of CORBA name server - command += " SALOME_Container "; + if(params.isMPI) + { + command += " mpirun -np "; + std::ostringstream o; + o << nbproc << " "; + command += o.str(); +#ifdef WITHLAM + command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#endif + command += " SALOME_MPIContainer "; + } + else + command += " SALOME_Container "; + + command += _NS->ContainerName(params); + command += " -id "; + sprintf(idc,"%ld",id); + command += idc; + command += " -"; + AddOmninamesParams(command); + command += " > /tmp/"; command += _NS->ContainerName(params); - command += "&"; + command += "_"; + command += GetHostname(); + command += "_"; + command += getenv( "USER" ) ; + command += ".log 2>&1 &" ; MESSAGE("command =" << command); } @@ -437,11 +461,12 @@ SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer string SALOME_ResourcesManager::BuildCommandToLaunchLocalContainer -(const Engines::MachineParameters& params) +(const Engines::MachineParameters& params, const long id) { _TmpFileName = ""; string command; int nbproc = 0; + char idc[sizeof(long)+1]; if (params.isMPI) { @@ -461,7 +486,9 @@ SALOME_ResourcesManager::BuildCommandToLaunchLocalContainer o << nbproc << " "; command += o.str(); +#ifdef WITHLAM command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#endif if (isPythonContainer(params.container_name)) command += "pyMPI SALOME_ContainerPy.py "; @@ -478,6 +505,9 @@ SALOME_ResourcesManager::BuildCommandToLaunchLocalContainer } command += _NS->ContainerName(params); + command += 
" -id "; + sprintf(idc,"%ld",id); + command += idc; command += " -"; AddOmninamesParams(command); command += " > /tmp/"; @@ -771,6 +801,9 @@ SALOME_ResourcesManager::BuildTempFileToLaunchRemoteContainer std::ostringstream o; tempOutputFile << nbproc << " "; +#ifdef WITHLAM + tempOutputFile << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#endif } tempOutputFile << (*(resInfo.ModulesPath.find("KERNEL"))).second diff --git a/src/ResourcesManager/SALOME_ResourcesManager.hxx b/src/ResourcesManager/SALOME_ResourcesManager.hxx index 029c9fa9e..b41e0d24a 100644 --- a/src/ResourcesManager/SALOME_ResourcesManager.hxx +++ b/src/ResourcesManager/SALOME_ResourcesManager.hxx @@ -53,10 +53,10 @@ class RESOURCESMANAGER_EXPORT SALOME_ResourcesManager std::string BuildCommandToLaunchRemoteContainer (const std::string& machine, - const Engines::MachineParameters& params); + const Engines::MachineParameters& params, const long id); std::string BuildCommandToLaunchLocalContainer - (const Engines::MachineParameters& params); + (const Engines::MachineParameters& params, const long id); void RmTmpFile(); -- 2.39.2