From: secher Date: Fri, 18 Dec 2009 14:20:01 +0000 (+0000) Subject: debug for CCRT X-Git-Url: http://git.salome-platform.org/gitweb/?a=commitdiff_plain;h=442a6f573b41ac365ceff706e2932de5dab389f0;p=modules%2Fkernel.git debug for CCRT --- diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index d68d7aef7..f59f7b814 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -44,6 +44,9 @@ using namespace std; const char *SALOME_ContainerManager::_ContainerManagerNameInNS = "/ContainerManager"; +omni_mutex SALOME_ContainerManager::_numInstanceMutex; + + //============================================================================= /*! * Constructor @@ -53,8 +56,7 @@ const char *SALOME_ContainerManager::_ContainerManagerNameInNS = */ //============================================================================= -SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, - SALOME_ResourcesManager *rm, SALOME_NamingService *ns) +SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, SALOME_ResourcesManager *rm, SALOME_NamingService *ns):_nbprocUsed(0) { MESSAGE("constructor"); _NS = ns; @@ -273,13 +275,27 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param } MESSAGE("[GiveContainer] Resource selected is: " << resource_selected); + _numInstanceMutex.lock(); + // Step 5: get container in the naming service Engines::ResourceDefinition_var resource_definition = _ResManager->GetResourceDefinition(resource_selected.c_str()); std::string hostname(resource_definition->name.in()); std::string containerNameInNS; - if(params.isMPI) + if(params.isMPI){ + int nbproc; + if ( (params.resource_params.nb_node <= 0) && (params.resource_params.nb_proc_per_node <= 0) ) + nbproc = 1; + else if ( params.resource_params.nb_node == 0 ) + nbproc = params.resource_params.nb_proc_per_node; + 
else if ( params.resource_params.nb_proc_per_node == 0 ) + nbproc = params.resource_params.nb_node; + else + nbproc = params.resource_params.nb_node * params.resource_params.nb_proc_per_node; + if( getenv("LIBBATCH_NODEFILE") != NULL ) + machinesFile(nbproc); // A mpi parallel container register on zero node in NS containerNameInNS = _NS->BuildContainerNameForNS(params, GetMPIZeroNode(hostname).c_str()); + } else containerNameInNS = _NS->BuildContainerNameForNS(params, hostname.c_str()); MESSAGE("[GiveContainer] Container name in the naming service: " << containerNameInNS); @@ -295,8 +311,10 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param Engines::Container_var cont=Engines::Container::_narrow(obj); if(!cont->_non_existent()) { - if(std::string(params.mode.in())=="getorstart" or std::string(params.mode.in())=="get") + if(std::string(params.mode.in())=="getorstart" or std::string(params.mode.in())=="get"){ + _numInstanceMutex.unlock(); return cont._retn(); /* the container exists and params.mode is getorstart or get use it*/ + } else { INFOS("[GiveContainer] A container is already registered with the name: " << containerNameInNS << ", shutdown the existing container"); @@ -318,6 +336,7 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param if (std::string(local_params.parallelLib.in()) != "") { INFOS("[GiveContainer] PaCO++ container are not currently available"); + _numInstanceMutex.unlock(); return ret; } // Classic or Exe ? 
@@ -331,6 +350,7 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param if (CORBA::is_nil (Catalog)) { INFOS("[GiveContainer] Module Catalog is not found -> cannot launch a container"); + _numInstanceMutex.unlock(); return ret; } // Loop through component list @@ -349,6 +369,7 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param if(found) { INFOS("ContainerManager Error: you can't have 2 CEXE component in the same container" ); + _numInstanceMutex.unlock(); return Engines::Container::_nil(); } MESSAGE("[GiveContainer] Exe container found !: " << container_exe_tmp); @@ -360,19 +381,26 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param catch (ServiceUnreachable&) { INFOS("Caught exception: Naming Service Unreachable"); + _numInstanceMutex.unlock(); return ret; } catch (...) { INFOS("Caught unknown exception."); + _numInstanceMutex.unlock(); return ret; } // Step 8: start a new container MESSAGE("[GiveContainer] Try to launch a new container on " << resource_selected); std::string command; - if(hostname == Kernel_Utils::GetHostname()) + // if a parallel container is launched in batch job, command is: "mpirun -np nbproc -machinefile nodesfile SALOME_MPIContainer" + if( getenv("LIBBATCH_NODEFILE") != NULL && params.isMPI ) + command = BuildCommandToLaunchLocalContainer(params,container_exe); + // if a container is launched on localhost, command is "SALOME_Container" or "mpirun -np nbproc SALOME_MPIContainer" + else if(hostname == Kernel_Utils::GetHostname()) command = BuildCommandToLaunchLocalContainer(params, container_exe); + // if a container is launched in remote mode, command is "ssh resource_selected SALOME_Container" or "ssh resource_selected mpirun -np nbproc SALOME_MPIContainer" else command = BuildCommandToLaunchRemoteContainer(resource_selected, params, container_exe); @@ -404,6 +432,8 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param 
// launch container with a system call int status=system(command.c_str()); + _numInstanceMutex.unlock(); + if (status == -1){ MESSAGE("SALOME_ContainerManager::StartContainer rsh failed (system command status -1)"); RmTmpFile(_TmpFileName); // command file can be removed here @@ -672,6 +702,9 @@ SALOME_ContainerManager::BuildCommandToLaunchLocalContainer o << nbproc << " "; + if( getenv("LIBBATCH_NODEFILE") != NULL ) + o << "-machinefile " << _machinesFile << " "; + #ifdef WITHLAM o << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; #elif defined(WITHOPENMPI) @@ -837,26 +870,6 @@ string SALOME_ContainerManager::BuildTemporaryFileName() const return aFileName; } -string SALOME_ContainerManager::GetMPIZeroNode(string machine) -{ - int status; - string zeronode; - string cmd; - string tmpFile = BuildTemporaryFileName(); - - cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile; - - status = system(cmd.c_str()); - if( status == 0 ){ - ifstream fp(tmpFile.c_str(),ios::in); - fp >> zeronode; - } - - RmTmpFile(tmpFile); - - return zeronode; -} - //============================================================================= /*! * Builds in a temporary file the script to be launched. 
@@ -1688,3 +1701,48 @@ SALOME_ContainerManager::BuildCommandToLaunchParallelContainer(const std::string } #endif +string SALOME_ContainerManager::GetMPIZeroNode(string machine) +{ + int status; + string zeronode; + string cmd; + string tmpFile = BuildTemporaryFileName(); + + if( getenv("LIBBATCH_NODEFILE") == NULL ) + cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile; + else + cmd = "mpirun -np 1 -machinefile " + _machinesFile + " hostname > " + tmpFile; + + status = system(cmd.c_str()); + if( status == 0 ){ + ifstream fp(tmpFile.c_str(),ios::in); + fp >> zeronode; + } + + RmTmpFile(tmpFile); + + return zeronode; +} + +void SALOME_ContainerManager::machinesFile(const int nbproc) +{ + string tmp; + string nodesFile = getenv("LIBBATCH_NODEFILE"); + _machinesFile = Kernel_Utils::GetTmpFileName(); + ifstream fpi(nodesFile.c_str(),ios::in); + ofstream fpo(_machinesFile.c_str(),ios::out); + + for(int i=0;i<_nbprocUsed;i++) + fpi >> tmp; + + for(int i=0;i<nbproc;i++) + if( fpi >> tmp ) + fpo << tmp << endl; + else + throw SALOME_Exception("You ask more processes than batch session have allocated!"); + + _nbprocUsed += nbproc; + fpi.close(); + fpo.close(); + +} diff --git a/src/Container/SALOME_ContainerManager.hxx b/src/Container/SALOME_ContainerManager.hxx index a0e7e19eb..30c76ef8b 100644 --- a/src/Container/SALOME_ContainerManager.hxx +++ b/src/Container/SALOME_ContainerManager.hxx @@ -88,6 +88,8 @@ protected: std::string GetMPIZeroNode(std::string machine); + void machinesFile(const int nbproc); + // For PacO++ Parallel extension typedef std::vector actual_launch_machine_t; std::string BuildCommandToLaunchParallelContainer(const std::string& exe_name, @@ -115,5 +117,13 @@ protected: //! different behaviour if $APPLI exists (SALOME Application) bool _isAppliSalomeDefined; + //! attribute that contains the number of processes used in batch mode by MPI containers + int _nbprocUsed; + + //! 
attributes that contains the machinefile for MPI containers + std::string _machinesFile; + + static omni_mutex _numInstanceMutex ; // lib and instance protection + }; #endif diff --git a/src/Launcher/Launcher_Job_SALOME.cxx b/src/Launcher/Launcher_Job_SALOME.cxx index 40808ab8c..84ab93569 100644 --- a/src/Launcher/Launcher_Job_SALOME.cxx +++ b/src/Launcher/Launcher_Job_SALOME.cxx @@ -80,7 +80,7 @@ Launcher::Job_SALOME::buildSalomeScript(Batch::Parametre params) launch_script_stream << "echo '' >> $CATALOG_FILE" << std::endl; launch_script_stream << "cat $LIBBATCH_NODEFILE | sort -u | while read host" << std::endl; launch_script_stream << "do" << std::endl; - launch_script_stream << "echo '> $CATALOG_FILE" << std::endl; launch_script_stream << "echo ' userName=\"" << _resource_definition.UserName << "\"' >> $CATALOG_FILE" << std::endl; launch_script_stream << "echo ' appliPath=\"" << _resource_definition.AppliPath << "\"' >> $CATALOG_FILE" << std::endl;