#include <vector>
#include "Utils_CorbaException.hxx"
#include "Batch_Date.hxx"
+#include <sstream>
#ifdef WITH_PACO_PARALLEL
-#include "PaCO++.h"
+#include "PaCOPP.hxx"
#endif
#define TIME_OUT_TO_LAUNCH_CONT 61
Engines::ContainerManager::_narrow(obj);
_NS->Register(refContMan,_ContainerManagerNameInNS);
- _MpiStarted = false;
_isAppliSalomeDefined = (getenv("APPLI") != 0);
+
+#ifdef HAVE_MPI2
+ if( getenv("OMPI_URI_FILE") != NULL ){
+ system("killall ompi-server");
+ string command;
+ command = "ompi-server -r ";
+ command += getenv("OMPI_URI_FILE");
+ int status=system(command.c_str());
+ if(status!=0)
+ throw SALOME_Exception("Error when launching ompi-server");
+ }
+#endif
+
MESSAGE("constructor end");
}
SALOME_ContainerManager::~SALOME_ContainerManager() // Destructor: logs, then (MPI2 builds only) stops ompi-server
{
MESSAGE("destructor");
+#ifdef HAVE_MPI2
+ if( getenv("OMPI_URI_FILE") != NULL ) // same condition under which the constructor launches ompi-server
+ system("killall ompi-server"); // kills every process named ompi-server; the exit status is ignored
+#endif
}
//=============================================================================
+//! shutdown all the containers, then the ContainerManager servant
/*! CORBA method:
- * shutdown all the containers, then the ContainerManager servant
*/
//=============================================================================
}
//=============================================================================
+//! Loop on all the containers listed in naming service, ask shutdown on each
/*! CORBA Method:
- * Loop on all the containers listed in naming service, ask shutdown on each
*/
//=============================================================================
for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
SCRUTE((*iter));
}
- for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
- SCRUTE((*iter));
- CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
- Engines::Container_var cont=Engines::Container::_narrow(obj);
- if(!CORBA::is_nil(cont))
- {
- MESSAGE("ShutdownContainers: " << (*iter));
- try
- {
- cont->Shutdown();
- }
- catch(CORBA::SystemException& e)
- {
- INFOS("CORBA::SystemException ignored : " << e);
- }
- catch(CORBA::Exception&)
- {
- INFOS("CORBA::Exception ignored.");
- }
- catch(...)
- {
- INFOS("Unknown exception ignored.");
- }
- }
- else
- MESSAGE("ShutdownContainers: no container ref for " << (*iter));
+ for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++)
+ {
+ try
+ {
+ SCRUTE((*iter));
+ CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
+ Engines::Container_var cont=Engines::Container::_narrow(obj);
+ if(!CORBA::is_nil(cont))
+ {
+ MESSAGE("ShutdownContainers: " << (*iter));
+ cont->Shutdown();
+ }
+ else
+ MESSAGE("ShutdownContainers: no container ref for " << (*iter));
+ }
+ catch(CORBA::SystemException& e)
+ {
+ INFOS("CORBA::SystemException ignored : " << e);
+ }
+ catch(CORBA::Exception&)
+ {
+ INFOS("CORBA::Exception ignored.");
+ }
+ catch(...)
+ {
+ INFOS("Unknown exception ignored.");
+ }
}
}
}
//=============================================================================
-//! Find a suitable Container in a list of machines, or start one
+//! Give a suitable Container given constraints
/*! CORBA Method:
* \param params Machine Parameters required for the container
- * \param possibleComputers list of machines usable for find or start
+ * \return the container or nil
*/
//=============================================================================
Engines::Container_ptr
-SALOME_ContainerManager::
-FindOrStartContainer(const Engines::MachineParameters& params,
- const Engines::MachineList& possibleComputers)
+SALOME_ContainerManager::GiveContainer(const Engines::MachineParameters& params)
{
- Engines::Container_ptr ret = FindContainer(params,possibleComputers);
- if(!CORBA::is_nil(ret))
- return ret;
- MESSAGE("Container doesn't exist try to launch it ...");
+  // Batch mode (environment variable SALOME_BATCH set to "1"): hand out the
+  // already launched containers one after the other, wrapping around at the
+  // end of the list. In any other case a container is started from params.
+  const char *valenv=getenv("SALOME_BATCH");
+  if(valenv && strcmp(valenv,"1")==0)
+    {
+      if(_batchLaunchedContainers.empty())
+        fillBatchLaunchedContainers();
- return StartContainer(params,possibleComputers,Engines::P_FIRST);
+      // The list may still be empty after the fill: begin() would then equal
+      // end() and dereferencing the iterator below would be undefined
+      // behaviour, so return nil instead.
+      if(_batchLaunchedContainers.empty())
+        {
+          INFOS("GiveContainer: no batch launched container available");
+          return Engines::Container::_nil();
+        }
+      if (_batchLaunchedContainersIter == _batchLaunchedContainers.end())
+        _batchLaunchedContainersIter = _batchLaunchedContainers.begin();
+      // The caller receives its own duplicated reference
+      Engines::Container_ptr rtn = Engines::Container::_duplicate(*_batchLaunchedContainersIter);
+      _batchLaunchedContainersIter++;
+      return rtn;
+    }
+  return StartContainer(params);
}
//=============================================================================
-//! Start a suitable Container in a list of machines with constraints and a policy
+//! Start a suitable Container in a list of machines with constraints
/*! C++ Method:
* Constraints are given by a machine parameters struct
* \param params Machine Parameters required for the container
* \param possibleComputers list of machines usable for start
- * \param policy policy to use (first,cycl or best)
* \param container_exe specific container executable (default=SALOME_Container)
*/
//=============================================================================
Engines::Container_ptr
-SALOME_ContainerManager::
-StartContainer(const Engines::MachineParameters& params,
+SALOME_ContainerManager::StartContainer(const Engines::MachineParameters& params,
const Engines::MachineList& possibleComputers,
- Engines::ResPolicy policy,const std::string& container_exe)
+ const std::string& container_exe)
{
#ifdef WITH_PACO_PARALLEL
std::string parallelLib(params.parallelLib);
if (parallelLib != "")
- return FindOrStartParallelContainer(params, possibleComputers);
+ {
+ Engines::MachineParameters myparams(params);
+ myparams.computerList=possibleComputers;
+ return StartParallelContainer(myparams);
+ }
#endif
string containerNameInNS;
Engines::Container_ptr ret = Engines::Container::_nil();
//check if an entry exists in Naming service
//if params.mode == "start" or "" shutdown the existing container before launching a new one with that name
//if params.mode == "getorstart" or "get" use the existing container
- containerNameInNS = _NS->BuildContainerNameForNS(params,theMachine.c_str());
+ if(params.isMPI)
+ // An MPI container is registered in the naming service under its zero node
+ containerNameInNS = _NS->BuildContainerNameForNS(params,GetMPIZeroNode(theMachine).c_str());
+ else
+ containerNameInNS = _NS->BuildContainerNameForNS(params,theMachine.c_str());
SCRUTE(containerNameInNS);
CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
if(std::string(params.mode.in())=="getorstart"||std::string(params.mode.in())=="get")
return cont._retn(); /* the container exists and params.mode is getorstart or get use it*/
else
- cont->Shutdown(); // shutdown the registered container if it exists
+ {
+ INFOS("A container is already registered with the name: " << containerNameInNS << ", shutdown the existing container");
+ cont->Shutdown(); // shutdown the registered container if it exists
+ }
}
}
catch(CORBA::Exception&)
command = BuildCommandToLaunchRemoteContainer(theMachine,params,container_exe);
//redirect stdout and stderr in a file
- string logFilename="/tmp/"+_NS->ContainerName(params)+"_"+ theMachine +"_"+getenv( "USER" )+".log" ;
- command += " > " + logFilename + " 2>&1 &";
+#ifdef WNT
+ string logFilename=getenv("TEMP");
+ logFilename += "\\";
+#else
+ string logFilename="/tmp";
+ char* val = getenv("SALOME_TMP_DIR");
+ if(val)
+ {
+ struct stat file_info;
+ stat(val, &file_info);
+ bool is_dir = S_ISDIR(file_info.st_mode);
+ if (is_dir)logFilename=val;
+ else std::cerr << "SALOME_TMP_DIR environment variable is not a directory use /tmp instead" << std::endl;
+ }
+ logFilename += "/";
+#endif
+ logFilename += _NS->ContainerName(params)+"_"+ theMachine +"_"+getenv( "USER" )+".log" ;
+ command += " > " + logFilename + " 2>&1";
+#ifdef WNT
+ command = "%PYTHONBIN% -c \"import win32pm ; win32pm.spawnpid(r'" + command + "', '')\"";
+#else
+ command += " &";
+#endif
// launch container with a system call
int status=system(command.c_str());
}
//=============================================================================
-//! Start a suitable Container for a list of components with constraints and a policy
+//! Start a suitable Container given constraints
/*! CORBA Method:
* \param params Machine Parameters required for the container
- * \param policy policy to use (first,cycl or best)
- * \param componentList list of component to be loaded on this container
*/
//=============================================================================
Engines::Container_ptr
-SALOME_ContainerManager::
-StartContainer(const Engines::MachineParameters& params,
- Engines::ResPolicy policy,
- const Engines::CompoList& componentList)
+SALOME_ContainerManager::StartContainer(const Engines::MachineParameters& params)
{
- Engines::MachineList_var possibleComputers = _ResManager->GetFittingResources(params,componentList);
+ Engines::MachineList_var possibleComputers = _ResManager->GetFittingResources(params);
// Look into ModulCatalog if a specific container must be launched
CORBA::String_var container_exe;
if (CORBA::is_nil (Catalog))
return Engines::Container::_nil();
// Loop through component list
- for(unsigned int i=0;i<componentList.length();i++)
+ for(unsigned int i=0;i<params.componentList.length();i++)
{
- const char* compoi = componentList[i];
+ const char* compoi = params.componentList[i];
SALOME_ModuleCatalog::Acomponent_var compoInfo = Catalog->GetComponent(compoi);
if (CORBA::is_nil (compoInfo))
{
}
if(found)
- return StartContainer(params,possibleComputers,policy,container_exe.in());
+ return StartContainer(params,possibleComputers,container_exe.in());
else
- return StartContainer(params,possibleComputers,policy);
+ return StartContainer(params,possibleComputers);
}
-#ifdef WITH_PACO_PARALLEL
//=============================================================================
+//! Find or start a suitable Container given some constraints
/*! CORBA Method:
- * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
* \param params Machine Parameters required for the container
- * \param possibleComputers list of machines usable for find or start
- *
- * \return CORBA container reference.
+ * \return the container or nil
*/
//=============================================================================
-Engines::Container_ptr
-SALOME_ContainerManager::
-FindOrStartParallelContainer(const Engines::MachineParameters& params_const,
- const Engines::MachineList& possibleComputers)
-{
- CORBA::Object_var obj;
- PaCO::InterfaceManager_var proxy;
- Engines::Container_ptr ret = Engines::Container::_nil();
- Engines::MachineParameters params(params_const);
- // Step 1 : Try to find a suitable container
- // Currently not as good as could be since
- // we have to verified the number of nodes of the container
- // if a user tell that.
- ret = FindContainer(params, possibleComputers);
-
- if(CORBA::is_nil(ret)) {
- // Step 2 : Starting a new parallel container
- INFOS("[FindOrStartParallelContainer] Starting a parallel container");
-
- // Step 2.1 : Choose a computer
- string theMachine = _ResManager->FindFirst(possibleComputers);
- if(theMachine == "") {
- INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
- INFOS("[FindOrStartParallelContainer] No possible computer found");
- INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
- }
- else {
- INFOS("[FindOrStartParallelContainer] on machine : " << theMachine);
- string command;
- if(theMachine == Kernel_Utils::GetHostname()) {
- // Step 3 : starting parallel container proxy
- params.hostname = CORBA::string_dup(theMachine.c_str());
- Engines::MachineParameters params_proxy(params);
- try {
- command = BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerProxy", params_proxy, "xterm");
- }
- catch(const SALOME_Exception & ex){
- MESSAGE(ex.what());
- return Engines::Container::_nil();
- }
- // LaunchParallelContainer uses this value to know if it launches the proxy or the nodes
- params_proxy.nb_component_nodes = 0;
- obj = LaunchParallelContainer(command, params_proxy, _NS->ContainerName(params));
- ret = Engines::Container::_narrow(obj);
- proxy = PaCO::InterfaceManager::_narrow(obj);
-
- // Step 4 : starting parallel container nodes
- command = BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerNode", params, "xterm");
- string name = _NS->ContainerName(params) + "Node";
- LaunchParallelContainer(command, params, name);
- // Step 5 : connecting nodes and the proxy to actually create a parallel container
- try {
- for (int i = 0; i < params.nb_component_nodes; i++) {
-
- char buffer [5];
-#ifndef WIN32
- snprintf(buffer,5,"%d",i);
-#else
- _snprintf(buffer,5,"%d",i);
-#endif
- string name_cont = name + string(buffer);
-
- string theNodeMachine(CORBA::string_dup(params.hostname));
- string containerNameInNS = _NS->BuildContainerNameForNS(name_cont.c_str(),theNodeMachine.c_str());
- int count = TIME_OUT_TO_LAUNCH_CONT;
- obj = _NS->Resolve(containerNameInNS.c_str());
- while (CORBA::is_nil(obj) && count) {
- INFOS("[FindOrStartParallelContainer] CONNECTION FAILED !!!!!!!!!!!!!!!!!!!!!!!!");
-#ifndef WIN32
- sleep(1) ;
-#else
- Sleep(1000);
-#endif
- count-- ;
- obj = _NS->Resolve(containerNameInNS.c_str());
- }
-
- PaCO::InterfaceParallel_var node = PaCO::InterfaceParallel::_narrow(obj);
- MESSAGE("[FindOrStartParallelContainer] Deploying node : " << name);
- node->deploy();
- }
- proxy->start();
- }
- catch(CORBA::SystemException& e)
- {
- INFOS("Caught CORBA::SystemException. : " << e);
- }
- catch(PortableServer::POA::ServantAlreadyActive&)
- {
- INFOS("Caught CORBA::ServantAlreadyActiveException");
- }
- catch(CORBA::Exception&)
- {
- INFOS("Caught CORBA::Exception.");
- }
- catch(std::exception& exc)
- {
- INFOS("Caught std::exception - "<<exc.what());
- }
- catch(...)
- {
- INFOS("Caught unknown exception.");
- }
- INFOS("[FindOrStartParallelContainer] node " << name << " deployed");
- }
- else {
- INFOS("[FindOrStartParallelContainer] Currently parallel containers are launched only on the local host");
- }
- }
- }
- return ret;
-}
-#else
-//=============================================================================
-/*! CORBA Method:
- * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
- * \param params Machine Parameters required for the container
- * \param possibleComputers list of machines usable for find or start
- *
- * \return CORBA container reference.
- */
-//=============================================================================
Engines::Container_ptr
-SALOME_ContainerManager::
-FindOrStartParallelContainer(const Engines::MachineParameters& params,
- const Engines::MachineList& possibleComputers)
+SALOME_ContainerManager::FindOrStartContainer(const Engines::MachineParameters& params)
{ // Returns a container found on params.computerList, otherwise the result of StartContainer(params)
- Engines::Container_ptr ret = Engines::Container::_nil();
- INFOS("[FindOrStartParallelContainer] is disabled !");
- INFOS("[FindOrStartParallelContainer] recompile SALOME Kernel to enable parallel extension");
- return ret;
+ Engines::Container_ptr ret = FindContainer(params,params.computerList); // search the machines listed in the parameters
+ if(!CORBA::is_nil(ret))
+ return ret; // a matching container was found: reuse it
+ MESSAGE("Container doesn't exist try to launch it ...");
+
+ return StartContainer(params); // nothing found: start a new container
}
-#endif
//=============================================================================
-//! Give a suitable Container for a list of components with constraints and a policy
-/*! CORBA Method:
- * \param params Machine Parameters required for the container
- * \param policy policy to use (first,cycl or best)
- * \param componentList list of component to be loaded on this container
+//! Find a container given constraints (params) on a list of machines (possibleComputers)
+/*!
+ *
*/
//=============================================================================
Engines::Container_ptr
-SALOME_ContainerManager::
-GiveContainer(const Engines::MachineParameters& params,
- Engines::ResPolicy policy,
- const Engines::CompoList& componentList)
+SALOME_ContainerManager::FindContainer(const Engines::MachineParameters& params,
+ const Engines::MachineList& possibleComputers)
{ // Returns the first non-nil container found while scanning possibleComputers in order, or nil
- char *valenv=getenv("SALOME_BATCH");
- if(valenv)
- if (strcmp(valenv,"1")==0)
- {
- if(_batchLaunchedContainers.empty())
- fillBatchLaunchedContainers();
-
- if (_batchLaunchedContainersIter == _batchLaunchedContainers.end())
- _batchLaunchedContainersIter = _batchLaunchedContainers.begin();
-
- Engines::Container_ptr rtn = Engines::Container::_duplicate(*_batchLaunchedContainersIter);
- _batchLaunchedContainersIter++;
- return rtn;
- }
- return StartContainer(params,policy,componentList);
+ MESSAGE("FindContainer "<<possibleComputers.length());
+ for(unsigned int i=0;i<possibleComputers.length();i++) // try each candidate machine in list order
+ {
+ MESSAGE("FindContainer possible " << possibleComputers[i]);
+ Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]); // presumably resolves to the (params, const char*) overload
+ if( !CORBA::is_nil(cont) )
+ return cont; // first non-nil reference wins, remaining machines are not examined
+ }
+ MESSAGE("FindContainer: not found");
+ return Engines::Container::_nil(); // no candidate machine gave a non-nil reference
}
//=============================================================================
-/*!
- *
+//! Find a container given constraints (params) on a machine (theMachine)
+/*!
+ *
*/
//=============================================================================
Engines::Container_ptr
-SALOME_ContainerManager::
-FindContainer(const Engines::MachineParameters& params,
- const char *theMachine)
+SALOME_ContainerManager::FindContainer(const Engines::MachineParameters& params,
+ const char *theMachine)
{
string containerNameInNS(_NS->BuildContainerNameForNS(params,theMachine));
CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
}
}
+#ifdef WITH_PACO_PARALLEL
//=============================================================================
-/*!
- *
+/*! CORBA Method:
+ * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
+ * \param params Machine Parameters required for the container
+ * \return CORBA container reference.
*/
//=============================================================================
-
Engines::Container_ptr
-SALOME_ContainerManager::
-FindContainer(const Engines::MachineParameters& params,
- const Engines::MachineList& possibleComputers)
+SALOME_ContainerManager::StartParallelContainer(const Engines::MachineParameters& params_const)
{
- MESSAGE("FindContainer "<<possibleComputers.length());
- for(unsigned int i=0;i<possibleComputers.length();i++)
+  // Returns an existing container matching the parameters when one is found on
+  // params_const.computerList; otherwise launches a PaCO++ proxy and its nodes,
+  // deploys the nodes, starts the proxy and returns the resulting container.
+  // Nil is returned on any failure.
+  CORBA::Object_var obj;
+  PaCO::InterfaceManager_var container_proxy;
+  Engines::Container_ptr ret = Engines::Container::_nil();
+  Engines::MachineParameters params(params_const); // local copy: hostname is overwritten below
+
+  // Step 1 : Try to find a suitable container
+  // Currently not as good as it could be: the number of nodes of the
+  // container should also be verified when the user specifies it.
+  ret = FindContainer(params, params.computerList);
+  if(CORBA::is_nil(ret)) {
+    // Step 2 : Starting a new parallel container !
+    INFOS("[StartParallelContainer] Starting a PaCO++ parallel container");
+
+    // Step 3 : Choose a computer
+    std::string theMachine = _ResManager->FindFirst(params.computerList);
+    //If the machine name is localhost use the real name
+    if(theMachine == "localhost")
+      theMachine=Kernel_Utils::GetHostname();
+
+    if(theMachine == "") {
+      INFOS("[StartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
+      INFOS("[StartParallelContainer] No possible computer found");
+      INFOS("[StartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
+      return ret;
+    }
+    INFOS("[StartParallelContainer] on machine : " << theMachine);
+    params.hostname = CORBA::string_dup(theMachine.c_str());
+
+    // Step 4 : starting parallel container proxy
+    Engines::MachineParameters params_proxy(params);
+    std::string command_proxy;
+    SALOME_ContainerManager::actual_launch_machine_t proxy_machine;
+    try
{
- MESSAGE("FindContainer possible " << possibleComputers[i]);
- Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
- if( !CORBA::is_nil(cont) )
- return cont;
+      command_proxy = BuildCommandToLaunchParallelContainer("SALOME_ParallelContainerProxy", params_proxy, proxy_machine);
}
- MESSAGE("FindContainer: not found");
- return Engines::Container::_nil();
+    catch(const SALOME_Exception & ex)
+    {
+      INFOS("[StartParallelContainer] Exception in BuildCommandToLaunchParallelContainer");
+      INFOS(ex.what());
+      return ret;
+    }
+    params_proxy.nb_component_nodes = 0; // LaunchParallelContainer uses this value to know if it launches the proxy or the nodes
+    obj = LaunchParallelContainer(command_proxy, params_proxy, _NS->ContainerName(params_proxy), proxy_machine);
+    if (CORBA::is_nil(obj))
+    {
+      INFOS("[StartParallelContainer] LaunchParallelContainer for proxy returns NIL !");
+      return ret;
+    }
+    try
+    {
+      container_proxy = PaCO::InterfaceManager::_narrow(obj);
+    }
+    catch(CORBA::SystemException& e)
+    {
+      INFOS("[StartParallelContainer] Exception in _narrow after LaunchParallelContainer for proxy !");
+      INFOS("CORBA::SystemException : " << e);
+      return ret;
+    }
+    catch(CORBA::Exception& e)
+    {
+      INFOS("[StartParallelContainer] Exception in _narrow after LaunchParallelContainer for proxy !");
+      INFOS("CORBA::Exception" << e);
+      return ret;
+    }
+    catch(...)
+    {
+      INFOS("[StartParallelContainer] Exception in _narrow after LaunchParallelContainer for proxy !");
+      INFOS("Unknown exception !");
+      return ret;
+    }
+    if (CORBA::is_nil(container_proxy))
+    {
+      INFOS("[StartParallelContainer] PaCO::InterfaceManager::_narrow returns NIL !");
+      return ret;
+    }
+
+    // From here on a proxy is running. Every failure of the node phase
+    // (steps 5 and 6) clears nodes_ok so that the proxy is shut down in one
+    // single place before nil is returned.
+    bool nodes_ok = true;
+
+    // Step 5 : starting parallel container nodes
+    std::string command_nodes;
+    std::string container_generic_node_name;
+    Engines::MachineParameters params_nodes(params);
+    SALOME_ContainerManager::actual_launch_machine_t nodes_machines;
+    try
+    {
+      command_nodes = BuildCommandToLaunchParallelContainer("SALOME_ParallelContainerNode", params_nodes, nodes_machines, proxy_machine[0]);
+    }
+    catch(const SALOME_Exception & ex)
+    {
+      INFOS("[StartParallelContainer] Exception in BuildCommandToLaunchParallelContainer");
+      INFOS(ex.what());
+      nodes_ok = false;
+    }
+    if (nodes_ok)
+    {
+      container_generic_node_name = _NS->ContainerName(params) + "Node";
+      obj = LaunchParallelContainer(command_nodes, params_nodes, container_generic_node_name, nodes_machines);
+      if (CORBA::is_nil(obj))
+      {
+        INFOS("[StartParallelContainer] LaunchParallelContainer for nodes returns NIL !");
+        nodes_ok = false;
+      }
+    }
+
+    // Step 6 : connecting nodes and the proxy to actually create a parallel container
+    for (int i = 0; nodes_ok && i < params.nb_component_nodes; i++)
+    {
+      // Node i is registered under "<generic name><i>" on the machine it was launched on
+      std::ostringstream tmp;
+      tmp << i;
+      std::string proc_number = tmp.str();
+      std::string container_node_name = container_generic_node_name + proc_number;
+
+      std::string theNodeMachine(nodes_machines[i]);
+      std::string containerNameInNS = _NS->BuildContainerNameForNS(container_node_name.c_str(), theNodeMachine.c_str());
+      obj = _NS->Resolve(containerNameInNS.c_str());
+      if (CORBA::is_nil(obj))
+      {
+        INFOS("[StartParallelContainer] CONNECTION FAILED From Naming Service !");
+        INFOS("[StartParallelContainer] Container name is " << containerNameInNS);
+        nodes_ok = false;
+        break;
+      }
+      try
+      {
+        MESSAGE("[StartParallelContainer] Deploying node : " << container_node_name);
+        PaCO::InterfaceParallel_var node = PaCO::InterfaceParallel::_narrow(obj);
+        node->deploy();
+        MESSAGE("[StartParallelContainer] node " << container_node_name << " is deployed");
+      }
+      catch(CORBA::SystemException& e)
+      {
+        INFOS("[StartParallelContainer] Exception in deploying node : " << containerNameInNS);
+        INFOS("CORBA::SystemException : " << e);
+        nodes_ok = false;
+      }
+      catch(CORBA::Exception& e)
+      {
+        INFOS("[StartParallelContainer] Exception in deploying node : " << containerNameInNS);
+        INFOS("CORBA::Exception" << e);
+        nodes_ok = false;
+      }
+      catch(...)
+      {
+        INFOS("[StartParallelContainer] Exception in deploying node : " << containerNameInNS);
+        INFOS("Unknown exception !");
+        nodes_ok = false;
+      }
+    }
+
+    if (!nodes_ok)
+    {
+      // The proxy must be killed: without its nodes it is useless
+      try
+      {
+        Engines::Container_var proxy = Engines::Container::_narrow(container_proxy);
+        proxy->Shutdown();
+      }
+      catch (...)
+      {
+        INFOS("[StartParallelContainer] Exception catched from proxy Shutdown...");
+      }
+      return ret;
+    }
+
+    // Step 7 : starting parallel container
+    // A failure here is only logged; ret then stays nil.
+    try
+    {
+      MESSAGE ("[StartParallelContainer] Starting parallel object");
+      container_proxy->start();
+      MESSAGE ("[StartParallelContainer] Parallel object is started");
+      ret = Engines::Container::_narrow(container_proxy);
+    }
+    catch(CORBA::SystemException& e)
+    {
+      INFOS("Caught CORBA::SystemException. : " << e);
+    }
+    catch(PortableServer::POA::ServantAlreadyActive&)
+    {
+      INFOS("Caught CORBA::ServantAlreadyActiveException");
+    }
+    catch(CORBA::Exception&)
+    {
+      INFOS("Caught CORBA::Exception.");
+    }
+    catch(std::exception& exc)
+    {
+      INFOS("Caught std::exception - "<<exc.what());
+    }
+    catch(...)
+    {
+      INFOS("Caught unknown exception.");
+    }
+  }
+  return ret;
+}
+#else
+//=============================================================================
+/*! CORBA Method:
+ * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
+ * \param params Machine Parameters required for the container
+ * \return CORBA container reference.
+ */
+//=============================================================================
+Engines::Container_ptr
+SALOME_ContainerManager::StartParallelContainer(const Engines::MachineParameters& params) // stub compiled when WITH_PACO_PARALLEL is not defined
+{
+ Engines::Container_ptr ret = Engines::Container::_nil(); // always nil: params is not used
+ INFOS("[StartParallelContainer] is disabled !");
+ INFOS("[StartParallelContainer] recompile SALOME Kernel to enable parallel extension");
+ return ret;
}
+#endif
//=============================================================================
/*! This method launches the parallel container.
CORBA::Object_ptr
SALOME_ContainerManager::LaunchParallelContainer(const std::string& command,
const Engines::MachineParameters& params,
- const std::string& name)
+ const std::string& name,
+ SALOME_ContainerManager::actual_launch_machine_t & vect_machine)
{
CORBA::Object_ptr obj = CORBA::Object::_nil();
- string containerNameInNS;
- MESSAGE("[LaunchParallelContainer] : command to launch...");
- MESSAGE(command);
- if (params.nb_component_nodes == 0) {
- INFOS("[LaunchParallelContainer] launching the proxy of the parallel container");
- int status = system(command.c_str());
- if (status == -1) {
- INFOS("[LaunchParallelContainer] failed : system command status -1");
- }
- else if (status == 217) {
- INFOS("[LaunchParallelContainer] failed : system command status 217");
- }
-
- int count = TIME_OUT_TO_LAUNCH_CONT;
- string theMachine(CORBA::string_dup(params.hostname));
- containerNameInNS = _NS->BuildContainerNameForNS((char*) name.c_str(),theMachine.c_str());
+ std::string containerNameInNS;
+ int count = TIME_OUT_TO_LAUNCH_CONT;
+
+ INFOS("[LaunchParallelContainer] Begin");
+ int status = system(command.c_str());
+ if (status == -1) {
+ INFOS("[LaunchParallelContainer] failed : system command status -1");
+ return obj;
+ }
+ else if (status == 217) {
+ INFOS("[LaunchParallelContainer] failed : system command status 217");
+ return obj;
+ }
- INFOS("[LaunchParallelContainer] Waiting for Parallel Container proxy on " << theMachine);
- while (CORBA::is_nil(obj) && count) {
+ if (params.nb_component_nodes == 0)
+ {
+ std::string theMachine(vect_machine[0]);
+ // Proxy case: a proxy has just been launched
+ containerNameInNS = _NS->BuildContainerNameForNS((char*) name.c_str(), theMachine.c_str());
+ INFOS("[LaunchParallelContainer] Waiting for Parallel Container proxy " << containerNameInNS << " on " << theMachine);
+ while (CORBA::is_nil(obj) && count)
+ {
#ifndef WIN32
sleep(1) ;
#else
obj = _NS->Resolve(containerNameInNS.c_str());
}
}
- else {
+ else
+ {
INFOS("[LaunchParallelContainer] launching the nodes of the parallel container");
- int status = system(command.c_str());
- if (status == -1) {
- INFOS("[LaunchParallelContainer] failed : system command status -1");
- }
- else if (status == 217) {
- INFOS("[LaunchParallelContainer] failed : system command status 217");
- }
// We are waiting all the nodes
- for (int i = 0; i < params.nb_component_nodes; i++) {
+ for (int i = 0; i < params.nb_component_nodes; i++)
+ {
obj = CORBA::Object::_nil();
- int count = TIME_OUT_TO_LAUNCH_CONT;
-
+ std::string theMachine(vect_machine[i]);
// Name of the node
- char buffer [5];
-#ifndef WIN32
- snprintf(buffer,5,"%d",i);
-#else
- _snprintf(buffer,5,"%d",i);
-#endif
-
- string name_cont = name + string(buffer);
-
- // I don't like this...
- string theMachine(CORBA::string_dup(params.hostname));
- containerNameInNS = _NS->BuildContainerNameForNS((char*) name_cont.c_str(),theMachine.c_str());
- cerr << "[LaunchContainer] Waiting for Parllel Container node " << containerNameInNS << " on " << theMachine << endl;
+ std::ostringstream tmp;
+ tmp << i;
+ std::string proc_number = tmp.str();
+ std::string container_node_name = name + proc_number;
+ containerNameInNS = _NS->BuildContainerNameForNS((char*) container_node_name.c_str(), theMachine.c_str());
+ INFOS("[LaunchParallelContainer] Waiting for Parallel Container node " << containerNameInNS << " on " << theMachine);
while (CORBA::is_nil(obj) && count) {
#ifndef WIN32
sleep(1) ;
count-- ;
obj = _NS->Resolve(containerNameInNS.c_str());
}
+ if (CORBA::is_nil(obj))
+ {
+ INFOS("[LaunchParallelContainer] Launch of node failed (or not found) !");
+ return obj;
+ }
}
}
-
- if ( CORBA::is_nil(obj) ) {
+ if (CORBA::is_nil(obj))
INFOS("[LaunchParallelContainer] failed");
- }
+
return obj;
}
*/
//=============================================================================
string
-SALOME_ContainerManager::BuildCommandToLaunchLocalParallelContainer(const std::string& exe_name,
- const Engines::MachineParameters& params,
- const std::string& log)
+SALOME_ContainerManager::BuildCommandToLaunchParallelContainer(const std::string& exe_name,
+ const Engines::MachineParameters& params,
+ SALOME_ContainerManager::actual_launch_machine_t & vect_machine,
+ const std::string proxy_hostname)
{
// Builds the shell command line that launches either the proxy or the
// node(s) of a parallel container, and returns it as a string.
//
// exe_name       : executable base name; parallelLib (or "Dummy" for an Mpi
//                  proxy) is appended to it, and the presence of "Proxy" in
//                  it selects the proxy case.
// params         : hostname, parallelLib, nb_component_nodes, container_name,
//                  nb_node and isMPI are read from it.
// vect_machine   : output; host names are appended to it (one for a proxy,
//                  nb_nodes for nodes).
// proxy_hostname : host name passed on the command line of the nodes.
//
// Throws SALOME_Exception when no machine file name is obtained for a remote
// launch or when parallelLib is neither "Dummy" nor "Mpi". Returns an empty
// string when system() reports -1 for the machine file copy (remote Mpi case).
//
// This method knows the differences between the proxy and the nodes.
// nb_component_nodes is not used in the same way if it is a proxy or
// a node.
-
- string command;
- string parallelLib(CORBA::string_dup(params.parallelLib));
- string hostname(CORBA::string_dup(params.hostname));
- int par = exe_name.find("Proxy");
- int nbproc = params.nb_component_nodes;
- char buffer [33];
- sprintf(buffer,"%d",nbproc);
+
+ //command = "gdb --args ";
+ //command = "valgrind --tool=memcheck --log-file=val_log ";
+ //command += real_exe_name;
+
+ // Step 0 : init some variables...
+ std::string parallelLib(CORBA::string_dup(params.parallelLib));
+ std::string real_exe_name = exe_name + parallelLib;
+ std::string machine_file_name("");
+ bool remote = false;
+ bool is_a_proxy = false;
+ std::string hostname(CORBA::string_dup(params.hostname));
+
+ std::ostringstream tmp_string;
+ CORBA::Long nb_nodes = params.nb_component_nodes;
+ tmp_string << nb_nodes;
+ std::string nbproc = tmp_string.str(); // nb_nodes as text, for the command line
Engines::MachineParameters_var rtn = new Engines::MachineParameters(); // passed to _NS->ContainerName() below
rtn->container_name = params.container_name;
rtn->nb_node = params.nb_node;
rtn->isMPI = params.isMPI;
- string real_exe_name = exe_name + parallelLib;
+ // Step 1 : local or remote launch ?
+ if (hostname != std::string(Kernel_Utils::GetHostname()) )
+ {
+ MESSAGE("[BuildCommandToLaunchParallelContainer] remote machine case detected !");
+ remote = true;
+ }
- if (parallelLib == "Dummy")
+ // Step 2 : proxy or nodes launch ?
+ std::string::size_type loc_proxy = exe_name.find("Proxy");
+ if( loc_proxy != string::npos ) {
+ is_a_proxy = true;
+ }
+
+ // Step 3 : for a remote launch, ask the resource manager for a machine file
+ // (parallelLib is forwarded; 1 is passed for a proxy, nb_component_nodes for nodes)
+ if (remote)
+ {
+ if (is_a_proxy)
+ {
+ machine_file_name = _ResManager->getMachineFile(hostname,
+ 1,
+ parallelLib);
+ }
+ else
+ {
+ machine_file_name = _ResManager->getMachineFile(hostname,
+ params.nb_component_nodes,
+ parallelLib);
+ }
+ if (machine_file_name == "")
{
- //command = "gdb --args ";
- //command = "valgrind --tool=memcheck --log-file=val_log ";
- //command += real_exe_name;
+ INFOS("[BuildCommandToLaunchParallelContainer] Error machine_file was not generated for machine " << hostname);
+ throw SALOME_Exception("Error machine_file was not generated");
+ }
+ MESSAGE("[BuildCommandToLaunchParallelContainer] machine_file_name is : " << machine_file_name);
+ }
+
+ // Step 4 : Log type chosen by the user through the PARALLEL_LOG environment variable
+ std::string log_env("");
+ char * get_val = getenv("PARALLEL_LOG");
+ if (get_val)
+ log_env = get_val;
+ std::string command_begin(""); // text placed before each launch command
+ std::string command_end(""); // text placed after each launch command
+ if(log_env == "xterm")
+ {
+ command_begin = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH;";
+ command_end = "\"&";
+ }
+ else if(log_env == "xterm_debug")
+ {
+ command_begin = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH;";
+ command_end = "; cat \" &";
+ }
+ else
+ {
+ // default into a file...
+ std::string logFilename = "/tmp/" + _NS->ContainerName(params) + "_" + hostname;
+ if (is_a_proxy)
+ logFilename += "_Proxy_";
+ else
+ logFilename += "_Node_";
+ logFilename += std::string(getenv("USER")) + ".log"; // NOTE(review): getenv("USER") is used without a null check
+ command_end = " > " + logFilename + " 2>&1 & ";
+ }
- command = real_exe_name;
+ // Step 5 : Building the command
+ std::string command("");
+ if (parallelLib == "Dummy")
+ {
+ if (is_a_proxy)
+ {
+ std::string command_remote("");
+ if (remote)
+ {
+ std::string machine_name;
+ std::ifstream machine_file(machine_file_name.c_str());
+ std::getline(machine_file, machine_name); // first line of the machine file
+ MESSAGE("[BuildCommandToLaunchParallelContainer] machine file name extracted is " << machine_name)
+
+ // We want to launch a command like :
+ // ssh user@machine distantPath/runRemote.sh hostNS portNS
+ const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine_name);
+ if (resInfo.Protocol == rsh)
+ command_remote = "rsh ";
+ else
+ command_remote = "ssh ";
+ command_remote += resInfo.UserName;
+ command_remote += "@";
+ command_remote += machine_name;
+ command_remote += " ";
+ command_remote += resInfo.AppliPath; // path relative to user@machine $HOME
+ command_remote += "/runRemote.sh ";
+ ASSERT(getenv("NSHOST"));
+ command_remote += getenv("NSHOST"); // hostname of CORBA name server
+ command_remote += " ";
+ ASSERT(getenv("NSPORT"));
+ command_remote += getenv("NSPORT"); // port of CORBA name server
+ command_remote += " ";
+
+ hostname = machine_name; // from here on, hostname is the machine read from the machine file
+ }
+ command = real_exe_name;
command += " " + _NS->ContainerName(rtn);
command += " " + parallelLib;
command += " " + hostname;
+ command += " " + nbproc;
command += " -";
AddOmninamesParams(command);
- }
- else if (parallelLib == "Mpi")
+ command = command_begin + command_remote + command + command_end;
+ vect_machine.push_back(hostname);
+ }
+ else
{
- // Step 1 : check if MPI is started
- if (_MpiStarted == false)
- {
- startMPI();
- }
+ std::ifstream * machine_file = NULL;
+ if (remote)
+ machine_file = new std::ifstream(machine_file_name.c_str());
+ for (int i= 0; i < nb_nodes; i++) // one launch command per node, all concatenated into command
+ {
+ std::string command_remote("");
+ if (remote)
+ {
+ std::string machine_name;
+ std::getline(*machine_file, machine_name); // next line of the machine file, used for node i
+ MESSAGE("[BuildCommandToLaunchParallelContainer] machine file name extracted is " << machine_name)
+
+ // We want to launch a command like :
+ // ssh user@machine distantPath/runRemote.sh hostNS portNS
+ const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine_name);
+ if (resInfo.Protocol == rsh)
+ command_remote = "rsh ";
+ else
+ command_remote = "ssh ";
+ command_remote += resInfo.UserName;
+ command_remote += "@";
+ command_remote += machine_name;
+ command_remote += " ";
+ command_remote += resInfo.AppliPath; // path relative to user@machine $HOME
+ command_remote += "/runRemote.sh ";
+ ASSERT(getenv("NSHOST"));
+ command_remote += getenv("NSHOST"); // hostname of CORBA name server
+ command_remote += " ";
+ ASSERT(getenv("NSPORT"));
+ command_remote += getenv("NSPORT"); // port of CORBA name server
+ command_remote += " ";
+
+ hostname = machine_name;
+ }
- if (par < 0)
- {
- // Nodes case
-
- command = "mpiexec -np " + string(buffer) + " ";
- // command += "gdb --args ";
- command += real_exe_name;
- command += " " + _NS->ContainerName(rtn);
- command += " " + parallelLib;
- command += " " + hostname;
- command += " -";
- AddOmninamesParams(command);
- }
- else
- {
- // Proxy case
- command = "mpiexec -np 1 ";
- command += real_exe_name;
- command += " " + _NS->ContainerName(rtn);
- command += " " + string(buffer);
- command += " " + parallelLib;
- command += " " + hostname;
- command += " -";
- AddOmninamesParams(command);
- }
+ std::ostringstream tmp;
+ tmp << i;
+ std::string proc_number = tmp.str();
+
+ std::string command_tmp("");
+ command_tmp += real_exe_name;
+ command_tmp += " " + _NS->ContainerName(rtn);
+ command_tmp += " " + parallelLib;
+ command_tmp += " " + proxy_hostname;
+ command_tmp += " " + proc_number;
+ command_tmp += " -";
+ AddOmninamesParams(command_tmp);
+
+ // Replace _Node_ with _Node<i>_ so that each node
+ // logs into its own file
+ std::string command_end_tmp = command_end;
+ std::string::size_type loc_node = command_end_tmp.find("_Node_");
+ if (loc_node != std::string::npos)
+ command_end_tmp.insert(loc_node+5, proc_number); // +5 : insert right after "_Node"
+ command += command_begin + command_remote + command_tmp + command_end_tmp;
+ vect_machine.push_back(hostname);
+ }
+ if (machine_file)
+ delete machine_file;
}
- else
+ }
+ else if (parallelLib == "Mpi")
+ {
+ // Step 0: if remote we have to copy the file
+ // to the first machine of the file
+ std::string remote_machine("");
+ if (remote)
{
- std::string message("Unknown parallelLib" + parallelLib);
- throw SALOME_Exception(message.c_str());
+ std::ifstream * machine_file = NULL;
+ machine_file = new std::ifstream(machine_file_name.c_str()); // NOTE(review): closed below but never deleted in this block
+ // Get first word of the line
+ // For MPI implementation the first word is the
+ // machine name
+ std::getline(*machine_file, remote_machine, ' ');
+ machine_file->close();
+ MESSAGE("[BuildCommandToLaunchParallelContainer] machine file name extracted is " << remote_machine)
+
+ // We want to launch a command like :
+ // scp mpi_machine_file user@machine:Path
+ std::string command_remote("");
+ const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(remote_machine);
+ if (resInfo.Protocol == rsh)
+ command_remote = "rcp ";
+ else
+ command_remote = "scp ";
+
+ command_remote += machine_file_name;
+ command_remote += " ";
+ command_remote += resInfo.UserName;
+ command_remote += "@";
+ command_remote += remote_machine;
+ command_remote += ":";
+ command_remote += machine_file_name;
+
+ int status = system(command_remote.c_str());
+ if (status == -1) // NOTE(review): only -1 is treated as a failure; other non-zero statuses are not checked
+ {
+ INFOS("copy of the mpi machine file failed !");
+ return "";
+ }
}
- // log choice
- if (log == "default")
+ if (is_a_proxy)
{
- command += " > /tmp/";
- command += _NS->ContainerName(rtn);
- command += "_";
- command += Kernel_Utils::GetHostname();
- command += "_";
- command += getenv( "USER" ) ;
- command += ".log 2>&1 &" ;
+ std::string command_remote("");
+ if (remote)
+ {
+ // We want to launch a command like :
+ // ssh user@machine distantPath/runRemote.sh hostNS portNS
+ const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(remote_machine);
+ if (resInfo.Protocol == rsh)
+ command_remote = "rsh ";
+ else
+ command_remote = "ssh ";
+ command_remote += resInfo.UserName;
+ command_remote += "@";
+ command_remote += remote_machine;
+ command_remote += " ";
+ command_remote += resInfo.AppliPath; // path relative to user@machine $HOME
+ command_remote += "/runRemote.sh ";
+ ASSERT(getenv("NSHOST"));
+ command_remote += getenv("NSHOST"); // hostname of CORBA name server
+ command_remote += " ";
+ ASSERT(getenv("NSPORT"));
+ command_remote += getenv("NSPORT"); // port of CORBA name server
+ command_remote += " ";
+
+ hostname = remote_machine;
+ }
+
+ // We use Dummy proxy for MPI parallel containers
+ real_exe_name = exe_name + "Dummy";
+ command = real_exe_name;
+ command += " " + _NS->ContainerName(rtn);
+ command += " Dummy";
+ command += " " + hostname;
+ command += " " + nbproc;
+ command += " -";
+ AddOmninamesParams(command);
+
+ command = command_begin + command_remote + command + command_end;
+ vect_machine.push_back(hostname);
}
- if (log == "xterm")
+ else
{
- command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; "
- + command + " \" &";
- // + command + "; echo $LD_LIBRARY_PATH; cat \" &";
- }
- return command;
+ std::string command_remote("");
+ if (remote)
+ {
+ const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(remote_machine);
+ if (resInfo.Protocol == rsh)
+ command_remote = "rsh ";
+ else
+ command_remote = "ssh ";
+ command_remote += resInfo.UserName;
+ command_remote += "@";
+ command_remote += remote_machine;
+ command_remote += " ";
+
+ std::string new_real_exe_name("");
+ new_real_exe_name += resInfo.AppliPath; // path relative to user@machine $HOME
+ new_real_exe_name += "/runRemote.sh ";
+ ASSERT(getenv("NSHOST"));
+ new_real_exe_name += getenv("NSHOST"); // hostname of CORBA name server
+ new_real_exe_name += " ";
+ ASSERT(getenv("NSPORT"));
+ new_real_exe_name += getenv("NSPORT"); // port of CORBA name server
+ new_real_exe_name += " ";
+
+ real_exe_name = new_real_exe_name + real_exe_name; // runRemote.sh prefix goes in front of the executable name
+ hostname = remote_machine;
+ }
- /* if (log == "xterm")
+ const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(hostname);
+ if (resInfo.mpi == lam)
{
- command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; echo $LD_LIBRARY_PATH; echo $PATH; " + command + "; cat \" &";
+ command = "mpiexec -ssi boot ";
+ if (resInfo.Protocol == rsh)
+ command += "rsh ";
+ else
+ command += "ssh ";
+ command += "-machinefile " + machine_file_name + " ";
+ command += "-n " + nbproc + " ";
+ command += real_exe_name;
+ command += " " + _NS->ContainerName(rtn);
+ command += " " + parallelLib;
+ command += " " + proxy_hostname;
+ command += " -";
+ AddOmninamesParams(command);
+ }
+ else
+ {
+ command = "mpirun -np " + nbproc + " ";
+ command += real_exe_name;
+ command += " " + _NS->ContainerName(rtn);
+ command += " " + parallelLib;
+ command += " " + proxy_hostname;
+ command += " -";
+ AddOmninamesParams(command);
}
- */
- /* command = "cd ; rm " + fichier_commande + "; touch " + \
- fichier_commande + "; echo \" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; " + \
- command + " >& /tmp/ribes_" + fichier_commande + " & \" > " + fichier_commande + ";";
- command += "ssh cn01 sh " + fichier_commande + " &";
- cerr << "La commande : " << command << endl;
- */
-}
-void SALOME_ContainerManager::startMPI()
-{
- cerr << "----------------------------------------------" << endl;
- cerr << "----------------------------------------------" << endl;
- cerr << "----------------------------------------------" << endl;
- cerr << "-Only Lam on Localhost is currently supported-" << endl;
- cerr << "----------------------------------------------" << endl;
- cerr << "----------------------------------------------" << endl;
- cerr << "----------------------------------------------" << endl;
-
- int status = system("lamboot");
- if (status == -1)
- {
- INFOS("lamboot failed : system command status -1");
- }
- else if (status == 217)
- {
- INFOS("lamboot failed : system command status 217");
+ command = command_begin + command_remote + command + command_end;
+ for (int i= 0; i < nb_nodes; i++)
+ vect_machine.push_back(proxy_hostname); // proxy_hostname is recorded once per node
}
+ }
else
- {
- _MpiStarted = true;
- }
+ {
+ std::string message("Unknown parallelLib : " + parallelLib);
+ throw SALOME_Exception(message.c_str());
+ }
+
+ MESSAGE("Parallel launch is: " << command);
+ return command;
}
string SALOME_ContainerManager::GetMPIZeroNode(string machine)