From 639b0a8bed2aebe0f3cecac5e6cb33eeceeb834b Mon Sep 17 00:00:00 2001 From: secher Date: Tue, 29 Nov 2005 14:13:03 +0000 Subject: [PATCH] debug of MPI containers --- salome_adm/unix/config_files/check_lam.m4 | 3 +- src/Communication/ReceiverFactory.cxx | 3 ++ src/Communication/Receivers.hxx | 4 +- src/Communication/SALOME_Comm_i.cxx | 2 +- src/Communication/SALOME_Comm_i.hxx | 6 +-- src/Communication/SenderFactory.cxx | 2 +- src/Communication_SWIG/libSALOME_Comm.i | 3 ++ src/Container/SALOME_Container.cxx | 8 ++-- src/Container/SALOME_ContainerManager.cxx | 35 ++++++++++++-- src/Container/SALOME_ContainerManager.hxx | 2 + src/MPIContainer/MPIContainer_i.cxx | 33 +++++++++++-- src/MPIContainer/MPIObject_i.cxx | 2 +- src/MPIContainer/SALOME_MPIContainer.cxx | 2 +- src/NamingService/SALOME_NamingService.cxx | 22 ++++++--- .../SALOME_ResourcesManager.cxx | 47 +++++++++++++++---- .../SALOME_ResourcesManager.hxx | 4 +- 16 files changed, 139 insertions(+), 39 deletions(-) diff --git a/salome_adm/unix/config_files/check_lam.m4 b/salome_adm/unix/config_files/check_lam.m4 index bde79d7e6..3d8d9d8b6 100644 --- a/salome_adm/unix/config_files/check_lam.m4 +++ b/salome_adm/unix/config_files/check_lam.m4 @@ -69,7 +69,8 @@ if test "$WITHLAM" = yes; then if test "$WITHLAM" = "yes";then WITHMPI="yes" mpi_ok=yes - MPI_LIBS="$MPI_LIBS -llammpi++" + MPI_LIBS="$MPI_LIBS -lmpi -llam" + CPPFLAGS="-DWITHLAM $CPPFLAGS" else mpi_ok=no fi diff --git a/src/Communication/ReceiverFactory.cxx b/src/Communication/ReceiverFactory.cxx index 1a2bd9125..94e52b256 100644 --- a/src/Communication/ReceiverFactory.cxx +++ b/src/Communication/ReceiverFactory.cxx @@ -1,3 +1,6 @@ +#ifdef HAVE_MPI2 +#include "mpi.h" +#endif #include "ReceiverFactory.hxx" #include "Receivers.hxx" using namespace std; diff --git a/src/Communication/Receivers.hxx b/src/Communication/Receivers.hxx index c4309b754..c02d9fffc 100644 --- a/src/Communication/Receivers.hxx +++ b/src/Communication/Receivers.hxx @@ -1,11 +1,11 @@ #ifndef _RECEIVERS_HXX_ #define _RECEIVERS_HXX_ -#include "SALOME_Comm_i.hxx" -#include "Receiver.hxx" #ifdef HAVE_MPI2 #include "mpi.h" #endif +#include "SALOME_Comm_i.hxx" +#include "Receiver.hxx" /*! Receiver used for transfert with CORBA when no copy is required remotely and locally. diff --git a/src/Communication/SALOME_Comm_i.cxx b/src/Communication/SALOME_Comm_i.cxx index 71fbdadf8..7eb8513f7 100644 --- a/src/Communication/SALOME_Comm_i.cxx +++ b/src/Communication/SALOME_Comm_i.cxx @@ -1,7 +1,7 @@ +#include "SALOME_Comm_i.hxx" #ifndef WNT #include #endif -#include "SALOME_Comm_i.hxx" #include "poa.h" #include "omnithread.h" #include "Utils_SINGLETON.hxx" diff --git a/src/Communication/SALOME_Comm_i.hxx b/src/Communication/SALOME_Comm_i.hxx index f7f196c62..1a8507ab0 100644 --- a/src/Communication/SALOME_Comm_i.hxx +++ b/src/Communication/SALOME_Comm_i.hxx @@ -1,12 +1,12 @@ #ifndef _SALOME_COMM_I_HXX_ #define _SALOME_COMM_I_HXX_ -#include -#include -#include CORBA_SERVER_HEADER(SALOME_Comm) #ifdef HAVE_MPI2 #include "mpi.h" #endif +#include +#include +#include CORBA_SERVER_HEADER(SALOME_Comm) #define TIMEOUT 20 diff --git a/src/Communication/SenderFactory.cxx b/src/Communication/SenderFactory.cxx index 04f8056d0..ef1e956e0 100644 --- a/src/Communication/SenderFactory.cxx +++ b/src/Communication/SenderFactory.cxx @@ -1,7 +1,7 @@ +#include "SALOME_Comm_i.hxx" #include "SenderFactory.hxx" #include "utilities.h" #include "SALOMEMultiComm.hxx" -#include "SALOME_Comm_i.hxx" using namespace std; #ifdef COMP_CORBA_DOUBLE diff --git a/src/Communication_SWIG/libSALOME_Comm.i b/src/Communication_SWIG/libSALOME_Comm.i index 42d4d5a3e..e96cc3eb0 100644 --- a/src/Communication_SWIG/libSALOME_Comm.i +++ b/src/Communication_SWIG/libSALOME_Comm.i @@ -2,6 +2,9 @@ %{ #include "ReceiverFactory.hxx" + #undef SEEK_SET + #undef SEEK_CUR + #undef SEEK_END #include "SALOME_Comm_i.hxx" %} diff --git a/src/Container/SALOME_Container.cxx b/src/Container/SALOME_Container.cxx index f6f1884ff..a97bb39dc 100644 --- a/src/Container/SALOME_Container.cxx +++ b/src/Container/SALOME_Container.cxx @@ -26,6 +26,10 @@ // Module : SALOME // $Header$ +#ifdef HAVE_MPI2 +#include +#endif + #include #include #include @@ -46,10 +50,6 @@ #include #endif -#ifdef HAVE_MPI2 -#include -#endif - #include "Container_init_python.hxx" using namespace std; diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index b7a3c708e..c7aab78a1 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -118,6 +118,10 @@ SALOME_ContainerManager:: FindOrStartContainer(const Engines::MachineParameters& params, const Engines::MachineList& possibleComputers) { + long id; + string containerNameInNS; + char idc[64]; + Engines::Container_ptr ret = FindContainer(params,possibleComputers); if(!CORBA::is_nil(ret)) return ret; @@ -128,6 +132,14 @@ FindOrStartContainer(const Engines::MachineParameters& params, string theMachine=_ResManager->FindBest(possibleComputers); MESSAGE("try to launch it on " << theMachine); + // Get Id for container: a parallel container register in Naming Service + // on the machine where is process 0. ContainerManager does'nt know the name + // of this machine before the launch of the parallel container. So to get + // the IOR of the parallel container in Naming Service, ContainerManager + // give an Id. The parallel container register his name under + // /ContainerManager/Id directory in NamingService + id = GetIdForContainer(); + string command; if(theMachine=="") { @@ -137,11 +149,11 @@ FindOrStartContainer(const Engines::MachineParameters& params, } else if(theMachine==GetHostname()) { - command=_ResManager->BuildCommandToLaunchLocalContainer(params); + command=_ResManager->BuildCommandToLaunchLocalContainer(params,id); } else command = - _ResManager->BuildCommandToLaunchRemoteContainer(theMachine,params); + _ResManager->BuildCommandToLaunchRemoteContainer(theMachine,params,id); _ResManager->RmTmpFile(); int status=system(command.c_str()); @@ -170,8 +182,14 @@ FindOrStartContainer(const Engines::MachineParameters& params, count-- ; if ( count != 10 ) MESSAGE( count << ". Waiting for FactoryServer on " << theMachine); - string containerNameInNS = - _NS->BuildContainerNameForNS(params,theMachine.c_str()); + if(params.isMPI){ + containerNameInNS = "/ContainerManager/id"; + sprintf(idc,"%ld",id); + containerNameInNS += idc; + } + else + containerNameInNS = + _NS->BuildContainerNameForNS(params,theMachine.c_str()); SCRUTE(containerNameInNS); CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str()); ret=Engines::Container::_narrow(obj); @@ -273,3 +291,12 @@ FindContainer(const Engines::MachineParameters& params, MESSAGE("FindContainer: not found"); return Engines::Container::_nil(); } + + +long SALOME_ContainerManager::GetIdForContainer(void) +{ + long id; + id = rand(); + return id; +} + diff --git a/src/Container/SALOME_ContainerManager.hxx b/src/Container/SALOME_ContainerManager.hxx index f8311e9f7..96114f892 100644 --- a/src/Container/SALOME_ContainerManager.hxx +++ b/src/Container/SALOME_ContainerManager.hxx @@ -58,6 +58,8 @@ private: FindContainer(const Engines::MachineParameters& params, const char *theMachine); + long GetIdForContainer(void); + SALOME_ResourcesManager *_ResManager; SALOME_NamingService *_NS; }; diff --git a/src/MPIContainer/MPIContainer_i.cxx b/src/MPIContainer/MPIContainer_i.cxx index 3b97ad793..635f137d6 100644 --- a/src/MPIContainer/MPIContainer_i.cxx +++ b/src/MPIContainer/MPIContainer_i.cxx @@ -45,25 +45,48 @@ Engines_MPIContainer_i::Engines_MPIContainer_i(int nbproc, int numproc, int argc, char *argv[]) : Engines_Container_i(orb,poa,containerName,argc,argv,false), MPIObject_i(nbproc,numproc) { + long id=0; + string IdContainerinNS; + char idc[64]; + MESSAGE("[" << numproc << "] activate object"); _id = _poa->activate_object(this); -// this->_add_ref(); + + if(argc>1){ + for(int i=0;i::Instance() ; -// ASSERT(SINGLETON_::IsAlreadyExisting()) ; _NS->init_orb( CORBA::ORB::_duplicate(_orb) ) ; -// Engines::Container_ptr pCont -// = Engines::Container::_narrow(POA_Engines::MPIContainer::_this()); CORBA::Object_var obj=_poa->id_to_reference(*_id); Engines::Container_var pCont = Engines::Container::_narrow(obj); + string hostname = GetHostname(); _containerName = _NS->BuildContainerNameForNS(containerName,hostname.c_str()); SCRUTE(_containerName); _NS->Register(pCont, _containerName.c_str()); + + // A parallel container register in Naming Service + // on the machine where is process 0. ContainerManager does'nt know the name + // of this machine before the launch of the parallel container. So to get + // the IOR of the parallel container in Naming Service, ContainerManager + // give an Id. The parallel container register his name under + // /ContainerManager/Id directory in NamingService + IdContainerinNS = "/ContainerManager/id"; + sprintf(idc,"%ld",id); + IdContainerinNS += idc; + SCRUTE(IdContainerinNS); + _NS->Register(pCont, IdContainerinNS.c_str()); + } // Root recupere les ior des container des autre process diff --git a/src/MPIContainer/MPIObject_i.cxx b/src/MPIContainer/MPIObject_i.cxx index 0da19e33c..ab6e1a386 100644 --- a/src/MPIContainer/MPIObject_i.cxx +++ b/src/MPIContainer/MPIObject_i.cxx @@ -24,9 +24,9 @@ // File : MPIObject_i.cxx // Module : SALOME +#include #include "MPIObject_i.hxx" #include "utilities.h" -#include using namespace std; MPIObject_i::MPIObject_i() diff --git a/src/MPIContainer/SALOME_MPIContainer.cxx b/src/MPIContainer/SALOME_MPIContainer.cxx index bf2322190..3932ff3d4 100644 --- a/src/MPIContainer/SALOME_MPIContainer.cxx +++ b/src/MPIContainer/SALOME_MPIContainer.cxx @@ -1,9 +1,9 @@ +#include #include #include "MPIContainer_i.hxx" #include "Utils_ORB_INIT.hxx" #include "Utils_SINGLETON.hxx" #include "utilities.h" -#include #include "SALOMETraceCollector.hxx" using namespace std; diff --git a/src/NamingService/SALOME_NamingService.cxx b/src/NamingService/SALOME_NamingService.cxx index a13dd52d4..ff670801b 100644 --- a/src/NamingService/SALOME_NamingService.cxx +++ b/src/NamingService/SALOME_NamingService.cxx @@ -543,6 +543,15 @@ SALOME_NamingService::ResolveComponent(const char* hostname, for (unsigned int ind = 0; ind < contList.size(); ind++) { name = contList[ind].c_str(); + + if ( nbproc >= 1 ){ + char *str_nbproc = new char[8]; + sprintf(str_nbproc, "_%d", nbproc); + if( strstr(name.c_str(),str_nbproc) == NULL) + continue; + delete [] str_nbproc; + } + name += "/"; name += componentName; SCRUTE(name); @@ -1458,15 +1467,16 @@ throw(ServiceUnreachable) void SALOME_NamingService::Destroy_FullDirectory(const char* Path) throw(ServiceUnreachable) { - Change_Directory(Path); - vector contList = list_directory(); + if( Change_Directory(Path) ){ + vector contList = list_directory(); - for (unsigned int ind = 0; ind < contList.size(); ind++) - Destroy_Name(contList[ind].c_str()); + for (unsigned int ind = 0; ind < contList.size(); ind++) + Destroy_Name(contList[ind].c_str()); - Destroy_Directory(Path); + Destroy_Directory(Path); - Destroy_Name(Path); + Destroy_Name(Path); + } } // ============================================================================ diff --git a/src/ResourcesManager/SALOME_ResourcesManager.cxx b/src/ResourcesManager/SALOME_ResourcesManager.cxx index 0a9aeeaea..384253f01 100644 --- a/src/ResourcesManager/SALOME_ResourcesManager.cxx +++ b/src/ResourcesManager/SALOME_ResourcesManager.cxx @@ -357,10 +357,12 @@ bool isPythonContainer(const char* ContainerName) string SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer (const string& machine, - const Engines::MachineParameters& params) + const Engines::MachineParameters& params,const long id) { string command; - + int nbproc; + char idc[64]; + if ( ! _isAppliSalomeDefined ) command = BuildTempFileToLaunchRemoteContainer(machine, params); @@ -370,8 +372,6 @@ SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer if (params.isMPI) { - int nbproc; - if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) ) nbproc = 1; else if ( params.nb_node == 0 ) @@ -418,9 +418,31 @@ SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer ASSERT(getenv("NSPORT")); command += getenv("NSPORT"); // port of CORBA name server - command += " SALOME_Container "; + if(params.isMPI){ + command += " mpirun -np "; + std::ostringstream o; + o << nbproc << " "; + command += o.str(); +#ifdef WITHLAM + command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#endif + command += " SALOME_MPIContainer "; + } + else + command += " SALOME_Container "; + command += _NS->ContainerName(params); + command += " -id "; + sprintf(idc,"%ld",id); + command += idc; + command += " -"; + AddOmninamesParams(command); + command += " > /tmp/"; command += _NS->ContainerName(params); - command += "&"; + command += "_"; + command += GetHostname(); + command += "_"; + command += getenv( "USER" ) ; + command += ".log 2>&1 &" ; MESSAGE("command =" << command); } @@ -437,11 +459,12 @@ SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer string SALOME_ResourcesManager::BuildCommandToLaunchLocalContainer -(const Engines::MachineParameters& params) +(const Engines::MachineParameters& params,const long id) { _TmpFileName = ""; string command; int nbproc = 0; + char idc[64]; if (params.isMPI) { @@ -461,7 +484,9 @@ SALOME_ResourcesManager::BuildCommandToLaunchLocalContainer o << nbproc << " "; command += o.str(); -// command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#ifdef WITHLAM + command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#endif if (isPythonContainer(params.container_name)) command += "pyMPI SALOME_ContainerPy.py "; @@ -478,6 +503,9 @@ SALOME_ResourcesManager::BuildCommandToLaunchLocalContainer } command += _NS->ContainerName(params); + command += " -id "; + sprintf(idc,"%ld",id); + command += idc; command += " -"; AddOmninamesParams(command); command += " > /tmp/"; @@ -771,6 +799,9 @@ SALOME_ResourcesManager::BuildTempFileToLaunchRemoteContainer std::ostringstream o; tempOutputFile << nbproc << " "; +#ifdef WITHLAM + tempOutputFile << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#endif } tempOutputFile << (*(resInfo.ModulesPath.find("KERNEL"))).second diff --git a/src/ResourcesManager/SALOME_ResourcesManager.hxx b/src/ResourcesManager/SALOME_ResourcesManager.hxx index 029c9fa9e..8c56d0783 100644 --- a/src/ResourcesManager/SALOME_ResourcesManager.hxx +++ b/src/ResourcesManager/SALOME_ResourcesManager.hxx @@ -53,10 +53,10 @@ class RESOURCESMANAGER_EXPORT SALOME_ResourcesManager std::string BuildCommandToLaunchRemoteContainer (const std::string& machine, - const Engines::MachineParameters& params); + const Engines::MachineParameters& params,const long id); std::string BuildCommandToLaunchLocalContainer - (const Engines::MachineParameters& params); + (const Engines::MachineParameters& params,const long id); void RmTmpFile(); -- 2.39.2