const char *SALOME_ContainerManager::_ContainerManagerNameInNS =
"/ContainerManager";
+omni_mutex SALOME_ContainerManager::_numInstanceMutex;
+
+
//=============================================================================
/*!
* Constructor
*/
//=============================================================================
-SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa,
- SALOME_ResourcesManager *rm, SALOME_NamingService *ns)
+SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa,
+                                                 SALOME_ResourcesManager *rm, SALOME_NamingService *ns):_nbprocUsed(0)
{
MESSAGE("constructor");
_NS = ns;
}
MESSAGE("[GiveContainer] Resource selected is: " << resource_selected);
+ _numInstanceMutex.lock();
+
// Step 5: get container in the naming service
Engines::ResourceDefinition_var resource_definition = _ResManager->GetResourceDefinition(resource_selected.c_str());
std::string hostname(resource_definition->name.in());
std::string containerNameInNS;
- if(params.isMPI)
+ if(params.isMPI){
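+    // compute the number of MPI processes from the resource parameters:
+    // 1 by default, otherwise nb_node, nb_proc_per_node or their product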
+ int nbproc;
+ if ( (params.resource_params.nb_node <= 0) && (params.resource_params.nb_proc_per_node <= 0) )
+ nbproc = 1;
+ else if ( params.resource_params.nb_node == 0 )
+ nbproc = params.resource_params.nb_proc_per_node;
+ else if ( params.resource_params.nb_proc_per_node == 0 )
+ nbproc = params.resource_params.nb_node;
+ else
+ nbproc = params.resource_params.nb_node * params.resource_params.nb_proc_per_node;
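+    // in a batch session, reserve nbproc nodes of the batch allocation in a machine file dedicated to this container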
+ if( getenv("LIBBATCH_NODEFILE") != NULL )
+ machinesFile(nbproc);
// A mpi parallel container register on zero node in NS
containerNameInNS = _NS->BuildContainerNameForNS(params, GetMPIZeroNode(hostname).c_str());
+ }
else
containerNameInNS = _NS->BuildContainerNameForNS(params, hostname.c_str());
MESSAGE("[GiveContainer] Container name in the naming service: " << containerNameInNS);
Engines::Container_var cont=Engines::Container::_narrow(obj);
if(!cont->_non_existent())
{
- if(std::string(params.mode.in())=="getorstart" or std::string(params.mode.in())=="get")
+ if(std::string(params.mode.in())=="getorstart" or std::string(params.mode.in())=="get"){
+ _numInstanceMutex.unlock();
return cont._retn(); /* the container exists and params.mode is getorstart or get use it*/
+ }
else
{
INFOS("[GiveContainer] A container is already registered with the name: " << containerNameInNS << ", shutdown the existing container");
if (std::string(local_params.parallelLib.in()) != "")
{
INFOS("[GiveContainer] PaCO++ container are not currently available");
+ _numInstanceMutex.unlock();
return ret;
}
// Classic or Exe ?
if (CORBA::is_nil (Catalog))
{
INFOS("[GiveContainer] Module Catalog is not found -> cannot launch a container");
+ _numInstanceMutex.unlock();
return ret;
}
// Loop through component list
if(found)
{
INFOS("ContainerManager Error: you can't have 2 CEXE component in the same container" );
+ _numInstanceMutex.unlock();
return Engines::Container::_nil();
}
MESSAGE("[GiveContainer] Exe container found !: " << container_exe_tmp);
catch (ServiceUnreachable&)
{
INFOS("Caught exception: Naming Service Unreachable");
+ _numInstanceMutex.unlock();
return ret;
}
catch (...)
{
INFOS("Caught unknown exception.");
+ _numInstanceMutex.unlock();
return ret;
}
// Step 8: start a new container
MESSAGE("[GiveContainer] Try to launch a new container on " << resource_selected);
std::string command;
- if(hostname == Kernel_Utils::GetHostname())
+  // if a parallel container is launched in a batch job, the command is: "mpirun -np nbproc -machinefile nodesfile SALOME_MPIContainer"
+  if( getenv("LIBBATCH_NODEFILE") != NULL && params.isMPI )
+    command = BuildCommandToLaunchLocalContainer(params, container_exe);
+  // if a container is launched on localhost, the command is "SALOME_Container" or "mpirun -np nbproc SALOME_MPIContainer"
+ else if(hostname == Kernel_Utils::GetHostname())
command = BuildCommandToLaunchLocalContainer(params, container_exe);
+  // if a container is launched in remote mode, the command is "ssh resource_selected SALOME_Container" or "ssh resource_selected mpirun -np nbproc SALOME_MPIContainer"
else
command = BuildCommandToLaunchRemoteContainer(resource_selected, params, container_exe);
// launch container with a system call
int status=system(command.c_str());
+ _numInstanceMutex.unlock();
+
if (status == -1){
MESSAGE("SALOME_ContainerManager::StartContainer rsh failed (system command status -1)");
RmTmpFile(_TmpFileName); // command file can be removed here
o << nbproc << " ";
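+  // in a batch session, restrict mpirun to the nodes reserved in the machine file of this container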
+ if( getenv("LIBBATCH_NODEFILE") != NULL )
+ o << "-machinefile " << _machinesFile << " ";
+
#ifdef WITHLAM
o << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
#elif defined(WITHOPENMPI)
return aFileName;
}
-string SALOME_ContainerManager::GetMPIZeroNode(string machine)
-{
- int status;
- string zeronode;
- string cmd;
- string tmpFile = BuildTemporaryFileName();
-
- cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile;
-
- status = system(cmd.c_str());
- if( status == 0 ){
- ifstream fp(tmpFile.c_str(),ios::in);
- fp >> zeronode;
- }
-
- RmTmpFile(tmpFile);
-
- return zeronode;
-}
-
//=============================================================================
/*!
* Builds in a temporary file the script to be launched.
}
#endif
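+//=============================================================================
+/*!
+ * Get the hostname of the node on which MPI process 0 of the container runs,
+ * i.e. the node under which an MPI parallel container registers in the naming service
+ */
+//=============================================================================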
+string SALOME_ContainerManager::GetMPIZeroNode(string machine)
+{
+ int status;
+ string zeronode;
+ string cmd;
+ string tmpFile = BuildTemporaryFileName();
+
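+  // outside a batch session, query the hostname through ssh on the target machine;
+  // in a batch session, let mpirun report the first node of the machine file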
+ if( getenv("LIBBATCH_NODEFILE") == NULL )
+ cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile;
+ else
+ cmd = "mpirun -np 1 -machinefile " + _machinesFile + " hostname > " + tmpFile;
+
+ status = system(cmd.c_str());
+ if( status == 0 ){
+ ifstream fp(tmpFile.c_str(),ios::in);
+ fp >> zeronode;
+ }
+
+ RmTmpFile(tmpFile);
+
+ return zeronode;
+}
+
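+//=============================================================================
+/*!
+ * Build the machine file of an MPI container from the batch node file
+ * (LIBBATCH_NODEFILE): skip the _nbprocUsed nodes already given to previous
+ * containers and reserve the next nbproc entries
+ */
+//=============================================================================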
+void SALOME_ContainerManager::machinesFile(const int nbproc)
+{
+ string tmp;
+ string nodesFile = getenv("LIBBATCH_NODEFILE");
+ _machinesFile = Kernel_Utils::GetTmpFileName();
+ ifstream fpi(nodesFile.c_str(),ios::in);
+ ofstream fpo(_machinesFile.c_str(),ios::out);
+
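+  // skip the nodes already assigned to previously launched containers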
+ for(int i=0;i<_nbprocUsed;i++)
+ fpi >> tmp;
+
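+  // copy the next nbproc node names into the machine file of this container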
+ for(int i=0;i<nbproc;i++)
+ if( fpi >> tmp )
+ fpo << tmp << endl;
+ else
+ throw SALOME_Exception("You ask more processes than batch session have allocated!");
+
+ _nbprocUsed += nbproc;
+ fpi.close();
+ fpo.close();
+}