1 // Copyright (C) 2007-2008 CEA/DEN, EDF R&D, OPEN CASCADE
3 // Copyright (C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
4 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 2.1 of the License.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // Lesser General Public License for more details.
16 // You should have received a copy of the GNU Lesser General Public
17 // License along with this library; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
22 #include "SALOME_ContainerManager.hxx"
23 #include "SALOME_NamingService.hxx"
24 #include "SALOME_ModuleCatalog.hh"
25 #include "Basics_Utils.hxx"
26 #include "Basics_DirUtils.hxx"
27 #include <sys/types.h>
33 #include "Utils_CorbaException.hxx"
34 #include "Batch_Date.hxx"
37 #ifdef WITH_PACO_PARALLEL
// Initial value of the `count` variable used by the wait loops in
// StartContainer() and LaunchParallelContainer(), which poll the naming
// service until the launched container resolves or `count` reaches zero.
// NOTE(review): presumably a number of polling attempts - confirm the unit.
41 #define TIME_OUT_TO_LAUNCH_CONT 61
// Container references gathered by fillBatchLaunchedContainers() and handed
// out by GiveContainer() when SALOME_BATCH is "1".
45 vector<Engines::Container_ptr> SALOME_ContainerManager::_batchLaunchedContainers;
// Cursor into _batchLaunchedContainers; GiveContainer() advances it after each
// use and resets it to begin() once it reaches end().
47 vector<Engines::Container_ptr>::iterator SALOME_ContainerManager::_batchLaunchedContainersIter;
// Name used to register (constructor) and unregister (Shutdown) the
// ContainerManager in the naming service.
49 const char *SALOME_ContainerManager::_ContainerManagerNameInNS =
52 //=============================================================================
56 * Define a CORBA single-thread policy for the server, which avoids having to deal
57 * with non-thread-safe usage such as Change_Directory in the SALOME naming service
59 //=============================================================================
// Constructor: creates a dedicated single-threaded POA, activates this servant
// on it and publishes the resulting reference in the naming service.
//   orb - ORB reference; a duplicate is kept in _orb
//   poa - parent POA used to create the "SThreadPOA" child POA
//   rm  - resources manager (NOTE(review): presumably stored in _ResManager - confirm)
//   ns  - naming service wrapper (NOTE(review): presumably stored in _NS - confirm)
61 SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, SALOME_ResourcesManager *rm, SALOME_NamingService *ns)
63 MESSAGE("constructor");
// The child POA shares the parent's POA manager.
67 PortableServer::POAManager_var pman = poa->the_POAManager();
68 _orb = CORBA::ORB::_duplicate(orb) ;
69 CORBA::PolicyList policies;
// The only policy set here is SINGLE_THREAD_MODEL.
71 PortableServer::ThreadPolicy_var threadPol =
72 poa->create_thread_policy(PortableServer::SINGLE_THREAD_MODEL);
73 policies[0] = PortableServer::ThreadPolicy::_duplicate(threadPol);
75 _poa = poa->create_POA("SThreadPOA",pman,policies);
// Activate this servant on the new POA and obtain its CORBA reference.
77 PortableServer::ObjectId_var id = _poa->activate_object(this);
78 CORBA::Object_var obj = _poa->id_to_reference(id);
79 Engines::ContainerManager_var refContMan =
80 Engines::ContainerManager::_narrow(obj);
// Publish the reference under the well-known ContainerManager name.
82 _NS->Register(refContMan,_ContainerManagerNameInNS);
// Remember whether the APPLI environment variable is set; it selects how the
// remote launch command is built in BuildCommandToLaunchRemoteContainer().
83 _isAppliSalomeDefined = (getenv("APPLI") != 0);
84 MESSAGE("constructor end");
87 //=============================================================================
91 //=============================================================================
// Destructor: emits a trace message.
93 SALOME_ContainerManager::~SALOME_ContainerManager()
95 MESSAGE("destructor");
98 //=============================================================================
99 //! shutdown all the containers, then the ContainerManager servant
102 //=============================================================================
// Shuts down the containers via ShutdownContainers(), removes the
// ContainerManager entry from the naming service, then deactivates this
// servant in its POA.
104 void SALOME_ContainerManager::Shutdown()
107 ShutdownContainers();
108 _NS->Destroy_Name(_ContainerManagerNameInNS);
// Look up this servant's object id so it can be deactivated.
109 PortableServer::ObjectId_var oid = _poa->servant_to_id(this);
110 _poa->deactivate_object(oid);
113 //=============================================================================
114 //! Loop on all the containers listed in naming service, ask shutdown on each
117 //=============================================================================
// Walks the "/Containers" directory of the naming service in two passes:
// the first pass collects the names of entries that narrow to a non-nil
// Engines::Container, the second pass resolves each collected name again
// and processes the container. CORBA and unknown exceptions are logged
// and ignored so that one failing entry does not stop the loop.
119 void SALOME_ContainerManager::ShutdownContainers()
121 MESSAGE("ShutdownContainers");
123 isOK = _NS->Change_Directory("/Containers");
125 vector<string> vec = _NS->list_directory_recurs();
// Names of the naming-service entries that are containers.
126 list<string> lstCont;
// Pass 1: keep only the entries that narrow to a non-nil container.
127 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++)
130 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
133 Engines::Container_var cont=Engines::Container::_narrow(obj);
134 if(!CORBA::is_nil(cont))
135 lstCont.push_back((*iter));
137 catch(const CORBA::Exception& e)
139 // ignore this entry and continue
142 MESSAGE("Container list: ");
143 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
// Pass 2: resolve each collected name again and handle the container.
// NOTE(review): per the header comment a shutdown is requested on each
// container here - confirm against the full loop body.
146 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++)
151 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
152 Engines::Container_var cont=Engines::Container::_narrow(obj);
153 if(!CORBA::is_nil(cont))
155 MESSAGE("ShutdownContainers: " << (*iter));
159 MESSAGE("ShutdownContainers: no container ref for " << (*iter));
161 catch(CORBA::SystemException& e)
163 INFOS("CORBA::SystemException ignored : " << e);
165 catch(CORBA::Exception&)
167 INFOS("CORBA::Exception ignored.");
171 INFOS("Unknown exception ignored.");
177 //=============================================================================
178 //! Give a suitable Container given constraints
180 * \param params Machine Parameters required for the container
181 * \return the container or nil
183 //=============================================================================
// Returns a container for the given machine parameters.
// When the SALOME_BATCH environment variable equals "1", a duplicated
// reference to the next entry of _batchLaunchedContainers is prepared (the
// list is filled on first use and the cursor wraps to the beginning once it
// reaches the end). Otherwise the request is delegated to
// StartContainer(params).
185 Engines::Container_ptr
186 SALOME_ContainerManager::GiveContainer(const Engines::MachineParameters& params)
// NOTE(review): getenv returns a null pointer when SALOME_BATCH is unset;
// confirm valenv is checked before the strcmp below.
188 char *valenv=getenv("SALOME_BATCH");
190 if (strcmp(valenv,"1")==0)
// Lazily collect the containers registered in the naming service.
192 if(_batchLaunchedContainers.empty())
193 fillBatchLaunchedContainers();
// Wrap the cursor around once every container has been handed out.
195 if (_batchLaunchedContainersIter == _batchLaunchedContainers.end())
196 _batchLaunchedContainersIter = _batchLaunchedContainers.begin();
// NOTE(review): if the list is still empty at this point, begin() == end()
// and the dereference below is invalid - confirm this cannot happen.
198 Engines::Container_ptr rtn = Engines::Container::_duplicate(*_batchLaunchedContainersIter);
199 _batchLaunchedContainersIter++;
// Default path: start (or reuse) a container according to params.
202 return StartContainer(params);
205 //=============================================================================
206 //! Start a suitable Container in a list of machines with constraints
208 * Constraints are given by a machine parameters struct
209 * \param params Machine Parameters required for the container
210 * \param possibleComputers list of machines usable for start
211 * \param container_exe specific container executable (default=SALOME_Container)
213 //=============================================================================
// Starts a container on one machine chosen among possibleComputers.
// Outline of the visible logic:
//   1. With WITH_PACO_PARALLEL and a non-empty params.parallelLib, delegate to
//      StartParallelContainer().
//   2. Build the candidate machine list `lm` (restricted to machines that
//      already host a live container when params.mode is "get").
//   3. Ask the resources manager to pick a machine with params.policy.
//   4. If a container is already registered under the computed name, either
//      return it (modes "getorstart"/"get") or shut it down.
//   5. Build a local or remote launch command, redirect its output to a log
//      file under /tmp, run it in the background with system().
//   6. Poll the naming service until the container reference resolves or the
//      counter initialised from TIME_OUT_TO_LAUNCH_CONT reaches zero.
// Every failure path visible here returns a nil container reference.
215 Engines::Container_ptr
216 SALOME_ContainerManager::StartContainer(const Engines::MachineParameters& params,
217 const Engines::MachineList& possibleComputers,
218 const std::string& container_exe)
220 #ifdef WITH_PACO_PARALLEL
// A parallel library name in the parameters selects the PaCO++ launch path.
221 std::string parallelLib(params.parallelLib);
222 if (parallelLib != "")
224 Engines::MachineParameters myparams(params);
225 myparams.computerList=possibleComputers;
226 return StartParallelContainer(myparams);
229 string containerNameInNS;
230 Engines::Container_ptr ret = Engines::Container::_nil();
232 MESSAGE("SALOME_ContainerManager::StartContainer " << possibleComputers.length());
235 // if mode is "get" keep only machines with existing containers
236 if(std::string(params.mode.in())=="get")
238 for(unsigned int i=0;i<possibleComputers.length();i++)
240 Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
// Keep the machine only if its container object still exists.
243 if(!cont->_non_existent())
244 lm.push_back(string(possibleComputers[i]));
246 catch(CORBA::Exception&)
248 // CORBA::Exception ignored.
// Any other mode: every possible computer is a candidate.
254 for(unsigned int i=0;i<possibleComputers.length();i++)
255 lm.push_back(string(possibleComputers[i]));
// Let the resources manager choose a machine according to the policy.
261 theMachine=_ResManager->GetImpl()->Find(params.policy.in(),lm);
263 catch( const SALOME_Exception &ex )
266 return Engines::Container::_nil();
269 //If the machine name is localhost use the real name
270 if(theMachine == "localhost")
271 theMachine=Kernel_Utils::GetHostname();
273 //check if an entry exists in Naming service
274 //if params.mode == "start" or "" shutdown the existing container before launching a new one with that name
275 //if params.mode == "getorstart" or "get" use the existing container
276 containerNameInNS = _NS->BuildContainerNameForNS(params,theMachine.c_str());
278 SCRUTE(containerNameInNS);
279 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
280 if ( !CORBA::is_nil(obj) )
284 Engines::Container_var cont=Engines::Container::_narrow(obj);
285 if(!cont->_non_existent())
287 if(std::string(params.mode.in())=="getorstart"||std::string(params.mode.in())=="get")
288 return cont._retn(); /* the container exists and params.mode is getorstart or get use it*/
291 INFOS("A container is already registered with the name: " << containerNameInNS << ", shutdown the existing container");
292 cont->Shutdown(); // shutdown the registered container if it exists
296 catch(CORBA::Exception&)
298 INFOS("CORBA::Exception ignored.");
302 //try to launch a new container
303 MESSAGE("try to launch it on " << theMachine);
307 MESSAGE("SALOME_ContainerManager::StartContainer : no possible computer");
308 return Engines::Container::_nil();
// Local launch when the chosen machine is this host, remote launch otherwise.
310 else if(theMachine==Kernel_Utils::GetHostname())
311 command = BuildCommandToLaunchLocalContainer(params,container_exe);
313 command = BuildCommandToLaunchRemoteContainer(theMachine,params,container_exe);
315 //redirect stdout and stderr in a file
// NOTE(review): getenv("USER") returns a null pointer when USER is unset,
// which would make this concatenation invalid - confirm USER is guaranteed.
316 string logFilename="/tmp/"+_NS->ContainerName(params)+"_"+ theMachine +"_"+getenv( "USER" )+".log" ;
// The trailing '&' runs the command in the background, so system() returns
// without waiting for the container to finish.
317 command += " > " + logFilename + " 2>&1 &";
319 // launch container with a system call
320 int status=system(command.c_str());
323 MESSAGE("SALOME_ContainerManager::StartContainer rsh failed (system command status -1)");
324 RmTmpFile(_TmpFileName); // command file can be removed here
325 return Engines::Container::_nil();
327 else if (status == 217){
328 MESSAGE("SALOME_ContainerManager::StartContainer rsh failed (system command status 217)");
329 RmTmpFile(_TmpFileName); // command file can be removed here
330 return Engines::Container::_nil();
// Poll the naming service until the new container registers itself or the
// counter is exhausted.
333 int count=TIME_OUT_TO_LAUNCH_CONT;
334 MESSAGE("count = "<<count);
335 while ( CORBA::is_nil(ret) && count ){
343 MESSAGE( count << ". Waiting for container on " << theMachine);
345 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
346 ret=Engines::Container::_narrow(obj);
349 if ( CORBA::is_nil(ret) )
351 MESSAGE("SALOME_ContainerManager::StartContainer rsh failed");
// Tell the container where its log lives, as "user@host:/tmp/....log".
355 logFilename=":"+logFilename;
356 logFilename="@"+Kernel_Utils::GetHostname()+logFilename;
357 logFilename=getenv( "USER" )+logFilename;
358 ret->logfilename(logFilename.c_str());
361 RmTmpFile(_TmpFileName); // command file can be removed here
366 //=============================================================================
367 //! Start a suitable Container given constraints
369 * \param params Machine Parameters required for the container
371 //=============================================================================
// Starts a container for params on one of the machines returned by the
// resources manager. The module catalog is consulted for every component of
// params.componentList to find out whether a specific container executable
// (implementation type CEXE) is required; the three-argument StartContainer()
// is then called with or without that executable name.
// Returns a nil reference when the catalog cannot be narrowed, when the
// CEXE conflict message below is emitted, or when an exception is caught.
373 Engines::Container_ptr
374 SALOME_ContainerManager::StartContainer(const Engines::MachineParameters& params)
// Machines that satisfy the requested parameters.
376 Engines::MachineList_var possibleComputers = _ResManager->GetFittingResources(params);
378 // Look into ModulCatalog if a specific container must be launched
379 CORBA::String_var container_exe;
383 CORBA::Object_var obj = _NS->Resolve("/Kernel/ModulCatalog");
384 SALOME_ModuleCatalog::ModuleCatalog_var Catalog = SALOME_ModuleCatalog::ModuleCatalog::_narrow(obj) ;
385 if (CORBA::is_nil (Catalog))
386 return Engines::Container::_nil();
387 // Loop through component list
388 for(unsigned int i=0;i<params.componentList.length();i++)
390 const char* compoi = params.componentList[i];
391 SALOME_ModuleCatalog::Acomponent_var compoInfo = Catalog->GetComponent(compoi);
392 if (CORBA::is_nil (compoInfo))
// The implementation name is recorded as the candidate container executable.
396 SALOME_ModuleCatalog::ImplType impl=compoInfo->implementation_type();
397 container_exe=compoInfo->implementation_name();
398 if(impl==SALOME_ModuleCatalog::CEXE)
402 INFOS("ContainerManager Error: you can't have 2 CEXE component in the same container" );
403 return Engines::Container::_nil();
409 catch (ServiceUnreachable&)
411 INFOS("Caught exception: Naming Service Unreachable");
412 return Engines::Container::_nil();
416 INFOS("Caught unknown exception.");
417 return Engines::Container::_nil();
// Launch with the specific executable found in the catalog...
421 return StartContainer(params,possibleComputers,container_exe.in());
// ...or with the default container executable.
423 return StartContainer(params,possibleComputers);
426 //=============================================================================
427 //! Find or start a suitable Container given some constraints
429 * \param params Machine Parameters required for the container
430 * \return the container or nil
432 //=============================================================================
// Looks for an existing container matching params on the machines listed in
// params.computerList; when none is found, falls back to
// StartContainer(params).
434 Engines::Container_ptr
435 SALOME_ContainerManager::FindOrStartContainer(const Engines::MachineParameters& params)
437 Engines::Container_ptr ret = FindContainer(params,params.computerList);
// NOTE(review): presumably the found container is returned here - confirm.
438 if(!CORBA::is_nil(ret))
440 MESSAGE("Container doesn't exist try to launch it ...");
442 return StartContainer(params);
445 //=============================================================================
446 //! Find a container given constraints (params) on a list of machines (possibleComputers)
450 //=============================================================================
// Searches possibleComputers in order by calling the single-machine
// FindContainer() overload on each entry.
// Returns a nil reference when the loop completes without a match.
453 Engines::Container_ptr
454 SALOME_ContainerManager::FindContainer(const Engines::MachineParameters& params,
455 const Engines::MachineList& possibleComputers)
457 MESSAGE("FindContainer "<<possibleComputers.length());
458 for(unsigned int i=0;i<possibleComputers.length();i++)
460 MESSAGE("FindContainer possible " << possibleComputers[i]);
461 Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
// NOTE(review): presumably the first non-nil container is returned - confirm.
462 if( !CORBA::is_nil(cont) )
465 MESSAGE("FindContainer: not found");
466 return Engines::Container::_nil();
468 //=============================================================================
469 //! Find a container given constraints (params) on a machine (theMachine)
473 //=============================================================================
// Looks up the container registered in the naming service for params on
// theMachine.
// Returns the narrowed container reference, or a nil reference when the
// resolved object no longer exists or a CORBA exception is raised.
475 Engines::Container_ptr
476 SALOME_ContainerManager::FindContainer(const Engines::MachineParameters& params,
477 const char *theMachine)
// Naming-service path built from the parameters and the machine name.
479 string containerNameInNS(_NS->BuildContainerNameForNS(params,theMachine));
480 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
// A stale naming-service entry counts as "not found".
483 if(obj->_non_existent())
484 return Engines::Container::_nil();
486 return Engines::Container::_narrow(obj);
488 catch(const CORBA::Exception& e)
490 return Engines::Container::_nil();
494 #ifdef WITH_PACO_PARALLEL
495 //=============================================================================
497 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
498 * \param params Machine Parameters required for the container
499 * \return CORBA container reference.
501 //=============================================================================
// PaCO++ build only: finds an existing parallel container matching
// params_const or starts a new one. Starting a new one launches a proxy
// process, then the node processes, resolves each node in the naming service
// to deploy it, and finally starts the parallel object through the proxy.
// The steps are numbered in the comments below.
502 Engines::Container_ptr
503 SALOME_ContainerManager::StartParallelContainer(const Engines::MachineParameters& params_const)
505 CORBA::Object_var obj;
506 PaCO::InterfaceManager_var container_proxy;
507 Engines::Container_ptr ret = Engines::Container::_nil();
// Work on a private copy because params.hostname is overwritten below.
508 Engines::MachineParameters params(params_const);
510 // Step 1 : Try to find a suitable container
511 // This is currently not as good as it could be: the number of nodes
512 // of the found container is not verified, even when the user has
513 // requested a specific number of nodes.
514 ret = FindContainer(params, params.computerList);
515 if(CORBA::is_nil(ret)) {
516 // Step 2 : Starting a new parallel container !
517 INFOS("[StartParallelContainer] Starting a PaCO++ parallel container");
519 // Step 3 : Choose a computer
520 std::string theMachine = _ResManager->FindFirst(params.computerList);
521 //If the machine name is localhost use the real name
522 if(theMachine == "localhost")
523 theMachine=Kernel_Utils::GetHostname();
525 if(theMachine == "") {
526 INFOS("[StartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
527 INFOS("[StartParallelContainer] No possible computer found");
528 INFOS("[StartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
531 INFOS("[StartParallelContainer] on machine : " << theMachine);
532 params.hostname = CORBA::string_dup(theMachine.c_str());
534 // Step 4 : starting parallel container proxy
535 Engines::MachineParameters params_proxy(params);
536 std::string command_proxy;
537 SALOME_ContainerManager::actual_launch_machine_t proxy_machine;
540 command_proxy = BuildCommandToLaunchParallelContainer("SALOME_ParallelContainerProxy", params_proxy, proxy_machine);
542 catch(const SALOME_Exception & ex)
544 INFOS("[StartParallelContainer] Exception in BuildCommandToLaunchParallelContainer");
548 params_proxy.nb_component_nodes = 0; // LaunchParallelContainer uses this value to know if it launches the proxy or the nodes
549 obj = LaunchParallelContainer(command_proxy, params_proxy, _NS->ContainerName(params_proxy), proxy_machine);
550 if (CORBA::is_nil(obj))
552 INFOS("[StartParallelContainer] LaunchParallelContainer for proxy returns NIL !");
557 container_proxy = PaCO::InterfaceManager::_narrow(obj);
559 catch(CORBA::SystemException& e)
561 INFOS("[StartParallelContainer] Exception in _narrow after LaunchParallelContainer for proxy !");
562 INFOS("CORBA::SystemException : " << e);
565 catch(CORBA::Exception& e)
567 INFOS("[StartParallelContainer] Exception in _narrow after LaunchParallelContainer for proxy !");
568 INFOS("CORBA::Exception" << e);
573 INFOS("[StartParallelContainer] Exception in _narrow after LaunchParallelContainer for proxy !");
574 INFOS("Unknown exception !");
577 if (CORBA::is_nil(container_proxy))
579 INFOS("[StartParallelContainer] PaCO::InterfaceManager::_narrow returns NIL !");
583 // Step 5 : starting parallel container nodes
584 std::string command_nodes;
585 Engines::MachineParameters params_nodes(params);
586 SALOME_ContainerManager::actual_launch_machine_t nodes_machines;
589 command_nodes = BuildCommandToLaunchParallelContainer("SALOME_ParallelContainerNode", params_nodes, nodes_machines, proxy_machine[0]);
591 catch(const SALOME_Exception & ex)
593 INFOS("[StartParallelContainer] Exception in BuildCommandToLaunchParallelContainer");
// Node names are the container name followed by "Node" and a suffix.
597 std::string container_generic_node_name = _NS->ContainerName(params) + "Node";
598 obj = LaunchParallelContainer(command_nodes, params_nodes, container_generic_node_name, nodes_machines);
599 if (CORBA::is_nil(obj))
601 INFOS("[StartParallelContainer] LaunchParallelContainer for nodes returns NIL !");
602 // The proxy must be killed
605 Engines::Container_var proxy = Engines::Container::_narrow(container_proxy);
610 INFOS("[StartParallelContainer] Exception catched from proxy Shutdown...");
615 // Step 6 : connecting nodes and the proxy to actually create a parallel container
616 for (int i = 0; i < params.nb_component_nodes; i++)
618 std::ostringstream tmp;
620 std::string proc_number = tmp.str();
621 std::string container_node_name = container_generic_node_name + proc_number;
623 std::string theNodeMachine(nodes_machines[i]);
624 std::string containerNameInNS = _NS->BuildContainerNameForNS(container_node_name.c_str(), theNodeMachine.c_str());
625 obj = _NS->Resolve(containerNameInNS.c_str());
626 if (CORBA::is_nil(obj))
628 INFOS("[StartParallelContainer] CONNECTION FAILED From Naming Service !");
629 INFOS("[StartParallelContainer] Container name is " << containerNameInNS);
634 MESSAGE("[StartParallelContainer] Deploying node : " << container_node_name);
635 PaCO::InterfaceParallel_var node = PaCO::InterfaceParallel::_narrow(obj);
637 MESSAGE("[StartParallelContainer] node " << container_node_name << " is deployed");
639 catch(CORBA::SystemException& e)
641 INFOS("[StartParallelContainer] Exception in deploying node : " << containerNameInNS);
642 INFOS("CORBA::SystemException : " << e);
645 catch(CORBA::Exception& e)
647 INFOS("[StartParallelContainer] Exception in deploying node : " << containerNameInNS);
648 INFOS("CORBA::Exception" << e);
653 INFOS("[StartParallelContainer] Exception in deploying node : " << containerNameInNS);
654 INFOS("Unknown exception !");
659 // Step 7 : starting parallel container
662 MESSAGE ("[StartParallelContainer] Starting parallel object");
663 container_proxy->start();
664 MESSAGE ("[StartParallelContainer] Parallel object is started");
// The proxy is what callers see as the container.
665 ret = Engines::Container::_narrow(container_proxy);
667 catch(CORBA::SystemException& e)
669 INFOS("Caught CORBA::SystemException. : " << e);
671 catch(PortableServer::POA::ServantAlreadyActive&)
673 INFOS("Caught CORBA::ServantAlreadyActiveException");
675 catch(CORBA::Exception&)
677 INFOS("Caught CORBA::Exception.");
679 catch(std::exception& exc)
681 INFOS("Caught std::exception - "<<exc.what());
685 INFOS("Caught unknown exception.");
691 //=============================================================================
693 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
694 * \param params Machine Parameters required for the container
695 * \return CORBA container reference.
697 //=============================================================================
// Variant used when the parallel extension is not compiled in: logs that the
// feature is disabled. `ret` is initialised to a nil container reference.
698 Engines::Container_ptr
699 SALOME_ContainerManager::StartParallelContainer(const Engines::MachineParameters& params)
701 Engines::Container_ptr ret = Engines::Container::_nil();
702 INFOS("[StartParallelContainer] is disabled !");
703 INFOS("[StartParallelContainer] recompile SALOME Kernel to enable parallel extension");
708 //=============================================================================
709 /*! This method launches the parallel container.
710 * It may eventually be moved into the resources manager.
712 * \param command to launch
713 * \param container's parameters
714 * \param name of the container
716 * \return CORBA container reference
718 //=============================================================================
// Runs `command` with system() and then waits for the launched process(es) to
// appear in the naming service.
//   command      - shell command that launches the proxy or the nodes
//   params       - params.nb_component_nodes == 0 means a proxy was launched;
//                  otherwise it is the number of nodes to wait for
//   name         - container name used to build the naming-service entry
//   vect_machine - machine of the proxy (index 0) or of each node (index i)
// `obj` holds the last object resolved from the naming service and is nil
// when resolution failed.
720 SALOME_ContainerManager::LaunchParallelContainer(const std::string& command,
721 const Engines::MachineParameters& params,
722 const std::string& name,
723 SALOME_ContainerManager::actual_launch_machine_t & vect_machine)
725 CORBA::Object_ptr obj = CORBA::Object::_nil();
726 std::string containerNameInNS;
// Counter bounding the wait loops below.
727 int count = TIME_OUT_TO_LAUNCH_CONT;
729 INFOS("[LaunchParallelContainer] Begin");
730 int status = system(command.c_str());
732 INFOS("[LaunchParallelContainer] failed : system command status -1");
735 else if (status == 217) {
736 INFOS("[LaunchParallelContainer] failed : system command status 217");
// nb_component_nodes == 0 is the caller's marker for "proxy launch".
740 if (params.nb_component_nodes == 0)
742 std::string theMachine(vect_machine[0]);
743 // Proxy We have launch a proxy
744 containerNameInNS = _NS->BuildContainerNameForNS((char*) name.c_str(), theMachine.c_str());
745 INFOS("[LaunchParallelContainer] Waiting for Parallel Container proxy " << containerNameInNS << " on " << theMachine);
746 while (CORBA::is_nil(obj) && count)
754 obj = _NS->Resolve(containerNameInNS.c_str());
759 INFOS("[LaunchParallelContainer] launching the nodes of the parallel container");
760 // We are waiting all the nodes
761 for (int i = 0; i < params.nb_component_nodes; i++)
// Reset so the wait loop below runs for this node.
763 obj = CORBA::Object::_nil();
764 std::string theMachine(vect_machine[i]);
// NOTE(review): the suffix is presumably the node index i - confirm.
766 std::ostringstream tmp;
768 std::string proc_number = tmp.str();
769 std::string container_node_name = name + proc_number;
770 containerNameInNS = _NS->BuildContainerNameForNS((char*) container_node_name.c_str(), theMachine.c_str());
771 INFOS("[LaunchParallelContainer] Waiting for Parallel Container node " << containerNameInNS << " on " << theMachine);
772 while (CORBA::is_nil(obj) && count) {
779 obj = _NS->Resolve(containerNameInNS.c_str());
781 if (CORBA::is_nil(obj))
783 INFOS("[LaunchParallelContainer] Launch of node failed (or not found) !");
788 if (CORBA::is_nil(obj))
789 INFOS("[LaunchParallelContainer] failed");
// Rebuilds _batchLaunchedContainers from the naming service: every entry
// found recursively under "/Containers" that narrows to a non-nil
// Engines::Container is appended, then the cursor is reset to the first one.
794 void SALOME_ContainerManager::fillBatchLaunchedContainers()
796 _batchLaunchedContainers.clear();
797 _NS->Change_Directory("/Containers");
798 vector<string> vec = _NS->list_directory_recurs();
799 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++){
800 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
// NOTE(review): the narrowed reference is kept as a raw _ptr in the vector,
// so clear() above does not release previously stored references - confirm
// whether that is intended.
801 Engines::Container_ptr cont=Engines::Container::_narrow(obj);
802 if(!CORBA::is_nil(cont)){
803 _batchLaunchedContainers.push_back(cont);
// Start handing out containers from the beginning of the new list.
806 _batchLaunchedContainersIter=_batchLaunchedContainers.begin();
809 //=============================================================================
811 * This is no longer valid (C++ containers are also Python containers)
813 //=============================================================================
// Tests whether ContainerName ends with the two characters "Py".
// Used by BuildCommandToLaunchLocalContainer() to pick the Python container
// launcher.
815 bool isPythonContainer(const char* ContainerName)
818 int len = strlen(ContainerName);
// Compare the last two characters of the name with "Py".
// NOTE(review): requires len >= 2 - confirm a guard precedes this test.
821 if (strcmp(ContainerName + len - 2, "Py") == 0)
827 //=============================================================================
829 * Builds the script to be launched
831 * If SALOME Application not defined ($APPLI),
832 * see BuildTempFileToLaunchRemoteContainer()
834 * Else rely on distant configuration. Command is under the form (example):
835 * ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \
836 * SALOME_Container containerName &"
838 * - where user is omitted if not specified in CatalogResources,
839 * - where distant path is always relative to user@machine $HOME, and
840 * equal to $APPLI if not specified in CatalogResources,
841 * - where hostNS is the hostname of CORBA naming server (set by scripts to
842 * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh)
843 * - where portNS is the port used by CORBA naming server (set by scripts to
844 * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh)
845 * - where workingdir is the requested working directory for the container.
846 * If WORKINGDIR (and workingdir) is not present the working dir will be $HOME
848 //=============================================================================
// Builds the shell command that launches a container on a remote machine.
// When no SALOME application is defined (_isAppliSalomeDefined is false) the
// command comes from BuildTempFileToLaunchRemoteContainer(). Otherwise the
// command is assembled from the resource description of `machine`: remote
// shell protocol, optional user name, application path, "/runRemote.sh",
// naming-service host and port, optional working directory, optional MPI
// prefix, container executable, container name and naming-service options.
// Throws SALOME_Exception("Unknown protocol") when the resource protocol is
// neither rsh nor ssh.
851 SALOME_ContainerManager::BuildCommandToLaunchRemoteContainer
852 (const string& machine,
853 const Engines::MachineParameters& params, const std::string& container_exe)
858 if ( ! _isAppliSalomeDefined )
859 command = BuildTempFileToLaunchRemoteContainer(machine, params);
863 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine);
// Number of MPI processes: the product of nb_node and nb_proc_per_node,
// where a zero in one field means "use the other field alone".
867 if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) )
869 else if ( params.nb_node == 0 )
870 nbproc = params.nb_proc_per_node;
871 else if ( params.nb_proc_per_node == 0 )
872 nbproc = params.nb_node;
874 nbproc = params.nb_node * params.nb_proc_per_node;
// Shape of the command being assembled:
877 // "ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \
878 // SALOME_Container containerName &"
880 if (resInfo.Protocol == rsh)
882 else if (resInfo.Protocol == ssh)
885 throw SALOME_Exception("Unknown protocol");
// The user name is only added when the resource catalog specifies one.
887 if (resInfo.UserName != "")
889 command += resInfo.UserName;
// Application path: from the resource catalog if given, else from $APPLI.
896 if (resInfo.AppliPath != "")
897 command += resInfo.AppliPath; // path relative to user@machine $HOME
900 ASSERT(getenv("APPLI"));
901 command += getenv("APPLI"); // path relative to user@machine $HOME
904 command += "/runRemote.sh ";
906 ASSERT(getenv("NSHOST"));
907 command += getenv("NSHOST"); // hostname of CORBA name server
910 ASSERT(getenv("NSPORT"));
911 command += getenv("NSPORT"); // port of CORBA name server
913 std::string wdir=params.workingdir.in();
916 command += " WORKINGDIR ";
918 if(wdir == "$TEMPDIR")
920 command += wdir; // requested working directory
// MPI launch prefix; the exported environment depends on the MPI flavour
// selected at compile time.
926 command += " mpirun -np ";
927 std::ostringstream o;
931 command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
932 #elif defined(WITHOPENMPI)
933 if( getenv("OMPI_URI_FILE") == NULL )
934 command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
936 command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
937 command += getenv("OMPI_URI_FILE");
940 command += " SALOME_MPIContainer ";
// Non-MPI case: the requested container executable.
943 command += " " +container_exe+ " ";
945 command += _NS->ContainerName(params);
// Append the options that let the container reach the naming service.
947 AddOmninamesParams(command);
949 MESSAGE("command =" << command);
955 //=============================================================================
957 * builds the command to be launched.
959 //=============================================================================
// Writes the command that launches a container on the local machine into a
// freshly named temporary file (_TmpFileName), makes the file executable and
// uses the file name itself as the command to run.
// The script content is, in order: an optional MPI prefix, an optional change
// to the requested working directory, the container executable (Python
// variant when the container name ends with "Py"), the container name and
// the naming-service options.
962 SALOME_ContainerManager::BuildCommandToLaunchLocalContainer
963 (const Engines::MachineParameters& params, const std::string& container_exe)
965 _TmpFileName = BuildTemporaryFileName();
969 ofstream command_file( _TmpFileName.c_str() );
973 //command = "mpirun -np ";
974 command_file << "mpirun -np ";
// Number of MPI processes: the product of nb_node and nb_proc_per_node,
// where a zero in one field means "use the other field alone".
976 if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) )
978 else if ( params.nb_node == 0 )
979 nbproc = params.nb_proc_per_node;
980 else if ( params.nb_proc_per_node == 0 )
981 nbproc = params.nb_node;
983 nbproc = params.nb_node * params.nb_proc_per_node;
985 //std::ostringstream o;
987 //o << nbproc << " ";
988 command_file << nbproc << " ";
990 //command += o.str();
992 //command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
993 command_file << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
994 #elif defined(WITHOPENMPI)
995 //command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace ";
996 if( getenv("OMPI_URI_FILE") == NULL )
997 command_file << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
1000 command_file << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
1001 command_file << getenv("OMPI_URI_FILE");
// MPI case: choose between the Python and the C++ MPI container.
1005 if (isPythonContainer(params.container_name))
1006 //command += "pyMPI SALOME_ContainerPy.py ";
1007 command_file << " pyMPI SALOME_ContainerPy.py ";
1009 //command += "SALOME_MPIContainer ";
1010 command_file << " SALOME_MPIContainer ";
1016 std::string wdir=params.workingdir.in();
1019 // a working directory is requested
1020 if(wdir == "$TEMPDIR")
1022 // a new temporary directory is requested
1023 string dir = Kernel_Utils::GetTmpDir();
1025 //command += "cd /d "+ dir +";";
1026 command_file << "cd /d " << dir << endl;
1028 //command = "cd "+ dir +";";
1029 command_file << "cd " << dir << ";";
1035 // a permanent directory is requested use it or create it
1037 //command="mkdir " + wdir;
1038 command_file << "mkdir " + wdir << endl;
1039 command_file << "cd /D " + wdir << endl;
1041 //command="mkdir -p " + wdir + " && cd " + wdir + ";";
1042 command_file << "mkdir -p " << wdir << " && cd " << wdir + ";";
// Non-MPI case: Python container script or the requested executable.
1046 if (isPythonContainer(params.container_name))
1047 //command += "SALOME_ContainerPy.py ";
1048 command_file << "SALOME_ContainerPy.py ";
1050 //command += container_exe + " ";
1051 command_file << container_exe + " ";
1055 command_file << _NS->ContainerName(params);
1056 command_file << " -";
// Append the options that let the container reach the naming service.
1057 AddOmninamesParams(command_file);
1058 command_file.close();
// 0x1ED is octal 0755: rwx for the owner, r-x for group and others.
1061 chmod(_TmpFileName.c_str(), 0x1ED);
// The generated script file is itself the command to execute.
1063 command = _TmpFileName;
1065 MESSAGE("Command is file ... " << command);
1070 //=============================================================================
1072 * removes the generated temporary file in case of a remote launch.
1074 //=============================================================================
// Deletes the temporary command file named by tmpFileName by running a shell
// command ("del /F" or "rm") through system(), then removes the containing
// directory as well when it has become empty.
1076 void SALOME_ContainerManager::RmTmpFile(std::string& tmpFileName)
1078 int lenght = tmpFileName.size();
1082 string command = "del /F ";
1084 string command = "rm ";
// One variant strips the last three characters of the name before deleting.
// NOTE(review): the condition selecting this variant is not shown - confirm.
1087 command += tmpFileName.substr(0, lenght - 3 );
1089 command += tmpFileName;
1091 system(command.c_str());
1092 //if dir is empty - remove it
1093 string tmp_dir = Kernel_Utils::GetDirByPath( tmpFileName );
1094 if ( Kernel_Utils::IsEmptyDir( tmp_dir ) )
1097 command = "del /F " + tmp_dir;
1099 command = "rmdir " + tmp_dir;
1101 system(command.c_str());
1106 //=============================================================================
1108 * add to command all options relative to naming service.
1110 //=============================================================================
// Appends the naming-service option text to `command`, starting with
// "ORBInitRef NameService=". The address is obtained from the naming service
// wrapper via getIORaddr().
1112 void SALOME_ContainerManager::AddOmninamesParams(string& command) const
1114 CORBA::String_var iorstr = _NS->getIORaddr();
1115 command += "ORBInitRef NameService=";
1120 //=============================================================================
1122 * add to command all options relative to naming service.
1124 //=============================================================================
// Stream overload: writes "ORBInitRef NameService=" followed by the IOR
// address returned by _NS->getIORaddr() to fileStream.
// Callers in this file write " -" right before calling, so the resulting
// option starts with a dash.
//
// fileStream: output file stream receiving the option text.
1126 void SALOME_ContainerManager::AddOmninamesParams(ofstream& fileStream) const
1128 CORBA::String_var iorstr = _NS->getIORaddr();
1129 fileStream << "ORBInitRef NameService=";
1130 fileStream << iorstr;
1133 //=============================================================================
1135 * generate a file name in /tmp directory
1137 //=============================================================================
// Builds the name of a temporary script file, starting from the name
// returned by Kernel_Utils::GetTmpFileName().
1139 string SALOME_ContainerManager::BuildTemporaryFileName() const
1141 //build more complex file name to support multiple salome session
1142 string aFileName = Kernel_Utils::GetTmpFileName();
// NOTE(review): a ".bat" suffix is appended here; presumably this is the
// Windows side of a platform guard -- TODO confirm.
1146 aFileName += ".bat";
1152 //=============================================================================
1154 * Builds in a temporary file the script to be launched.
1156 * Used if SALOME Application ($APPLI) is not defined.
1157 * The command is built with data from CatalogResources, in which every path
1158 * used on remote computer must be defined.
1160 //=============================================================================
// Writes a container launch script into a temporary file (_TmpFileName),
// changes its mode with chmod, copies it to `machine` with rcp or scp
// depending on the resource Protocol, and builds the command used to reach
// the script remotely.
//
// machine: name used to look up the resource description and placed in the
//          rcp/scp copy command.
// params:  supplies nb_node, nb_proc_per_node and container_name for the
//          script, and is passed to _NS->ContainerName().
// Throws SALOME_Exception with "Unknown protocol" or
// "Error of connection on remote host".
// NOTE(review): the conditions guarding each throw are not visible in this
// listing -- confirm them against the full source.
1163 SALOME_ContainerManager::BuildTempFileToLaunchRemoteContainer
1164 (const string& machine,
1165 const Engines::MachineParameters& params) throw(SALOME_Exception)
1169 _TmpFileName = BuildTemporaryFileName();
1170 ofstream tempOutputFile;
1171 tempOutputFile.open(_TmpFileName.c_str(), ofstream::out );
1172 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine);
1173 tempOutputFile << "#! /bin/sh" << endl;
1177 tempOutputFile << "export SALOME_trace=local" << endl; // mkr : 27.11.2006 : PAL13967 - Distributed supervision graphs - Problem with "SALOME_trace"
1178 //tempOutputFile << "source " << resInfo.PreReqFilePath << endl;
// MPI launch prefix. The process count comes from nb_node and
// nb_proc_per_node: when one of them is 0 the other is used, otherwise
// their product.
1184 tempOutputFile << "mpirun -np ";
1187 if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) )
1189 else if ( params.nb_node == 0 )
1190 nbproc = params.nb_proc_per_node;
1191 else if ( params.nb_proc_per_node == 0 )
1192 nbproc = params.nb_node;
1194 nbproc = params.nb_node * params.nb_proc_per_node;
1196 std::ostringstream o;
1198 tempOutputFile << nbproc << " ";
// Environment variables forwarded to the MPI processes; the option syntax
// differs between the two MPI build variants.
1200 tempOutputFile << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
1201 #elif defined(WITHOPENMPI)
// When OMPI_URI_FILE is set, its value is passed after "-ompi-server file:".
1202 if( getenv("OMPI_URI_FILE") == NULL )
1203 tempOutputFile << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
1205 tempOutputFile << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
1206 tempOutputFile << getenv("OMPI_URI_FILE");
// NOTE(review): getenv("KERNEL_ROOT_DIR") is streamed without a null check.
1211 tempOutputFile << getenv("KERNEL_ROOT_DIR") << "/bin/salome/";
// Executable name: Python or C++ container, in an MPI form and a plain form.
1215 if (isPythonContainer(params.container_name))
1216 tempOutputFile << " pyMPI SALOME_ContainerPy.py ";
1218 tempOutputFile << " SALOME_MPIContainer ";
1223 if (isPythonContainer(params.container_name))
1224 tempOutputFile << "SALOME_ContainerPy.py ";
1226 tempOutputFile << "SALOME_Container ";
// Container name, then " -" so that the naming-service option written by
// AddOmninamesParams starts with a dash; the script runs it in background.
1229 tempOutputFile << _NS->ContainerName(params) << " -";
1230 AddOmninamesParams(tempOutputFile);
1231 tempOutputFile << " &" << endl;
1232 tempOutputFile.flush();
1233 tempOutputFile.close();
// 0x1ED == 0755 octal (rwxr-xr-x).
1235 chmod(_TmpFileName.c_str(), 0x1ED);
1238 // --- Build command
// Copy the script to the remote machine with the tool matching the protocol.
1242 if (resInfo.Protocol == rsh)
1245 string commandRcp = "rcp ";
1246 commandRcp += _TmpFileName;
1248 commandRcp += machine;
1250 commandRcp += _TmpFileName;
1251 status = system(commandRcp.c_str());
1254 else if (resInfo.Protocol == ssh)
1257 string commandRcp = "scp ";
1258 commandRcp += _TmpFileName;
1260 commandRcp += machine;
1262 commandRcp += _TmpFileName;
1263 status = system(commandRcp.c_str());
1266 throw SALOME_Exception("Unknown protocol");
1269 throw SALOME_Exception("Error of connection on remote host");
// Remember the remote-access prefix before the script path is appended.
1272 _CommandForRemAccess = command;
1274 command += _TmpFileName;
1282 //=============================================================================
1283 /*! Creates a command line that the container manager uses to launch
1284 * a parallel container.
1286 //=============================================================================
// Builds the shell command line that launches a parallel container (either
// the proxy or its nodes) for the parallel library named in
// params.parallelLib, and records the hosts used in vect_machine.
//
// exe_name:       base executable name; parallelLib is appended to it. The
//                 substring "Proxy" is searched for in it to tell a proxy
//                 launch from a nodes launch.
// params:         machine parameters (hostname, nb_component_nodes,
//                 parallelLib, container_name, ...).
// vect_machine:   output; one host name is pushed per launched process.
// proxy_hostname: host name placed on the command line of node processes.
// Throws SALOME_Exception when the machine file name is empty, and with
// "Unknown parallelLib : ..." for a library other than the handled values
// ("Dummy", "Mpi").
1288 SALOME_ContainerManager::BuildCommandToLaunchParallelContainer(const std::string& exe_name,
1289 const Engines::MachineParameters& params,
1290 SALOME_ContainerManager::actual_launch_machine_t & vect_machine,
1291 const std::string proxy_hostname)
1293 // This method knows the differences between the proxy and the nodes.
1294 // nb_component_nodes is not used in the same way if it is a proxy or
1297 //command = "gdb --args ";
1298 //command = "valgrind --tool=memcheck --log-file=val_log ";
1299 //command += real_exe_name;
1301 // Step 0 : init some variables...
// NOTE(review): CORBA::string_dup returns a newly allocated copy; no matching
// release is visible here, so this copy (and the hostname one below) looks
// leaked -- confirm.
1302 std::string parallelLib(CORBA::string_dup(params.parallelLib));
1303 std::string real_exe_name = exe_name + parallelLib;
1304 std::string machine_file_name("");
1305 bool remote = false;
1306 bool is_a_proxy = false;
1307 std::string hostname(CORBA::string_dup(params.hostname));
// Textual form of the number of component nodes, used on command lines.
1309 std::ostringstream tmp_string;
1310 CORBA::Long nb_nodes = params.nb_component_nodes;
1311 tmp_string << nb_nodes;
1312 std::string nbproc = tmp_string.str();
// Field-by-field copy of params, later passed to _NS->ContainerName().
1314 Engines::MachineParameters_var rtn = new Engines::MachineParameters();
1315 rtn->container_name = params.container_name;
1316 rtn->hostname = params.hostname;
1317 rtn->OS = params.OS;
1318 rtn->mem_mb = params.mem_mb;
1319 rtn->cpu_clock = params.cpu_clock;
1320 rtn->nb_proc_per_node = params.nb_proc_per_node;
1321 rtn->nb_node = params.nb_node;
1322 rtn->isMPI = params.isMPI;
1324 // Step 1 : local or remote launch ?
1325 if (hostname != std::string(Kernel_Utils::GetHostname()) )
1327 MESSAGE("[BuildCommandToLaunchParallelContainer] remote machine case detected !");
1331 // Step 2 : proxy or nodes launch ?
1332 std::string::size_type loc_proxy = exe_name.find("Proxy");
1333 if( loc_proxy != string::npos ) {
1337 // Step 3 : Depending on the parallelLib, getting the machine file
1338 // ParallelLib Dummy has its own machine for this method
1343 machine_file_name = _ResManager->getMachineFile(hostname,
1349 machine_file_name = _ResManager->getMachineFile(hostname,
1350 params.nb_component_nodes,
1353 if (machine_file_name == "")
1355 INFOS("[BuildCommandToLaunchParallelContainer] Error machine_file was not generated for machine " << hostname);
1356 throw SALOME_Exception("Error machine_file was not generated");
1358 MESSAGE("[BuildCommandToLaunchParallelContainer] machine_file_name is : " << machine_file_name);
1361 // Step 4 : Log type chosen by the user
// The log mode is taken from the PARALLEL_LOG environment variable.
1362 std::string log_env("");
1363 char * get_val = getenv("PARALLEL_LOG");
1366 std::string command_begin("");
1367 std::string command_end("");
1368 if(log_env == "xterm")
1370 command_begin = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH;";
1371 command_end = "\"&";
1373 else if(log_env == "xterm_debug")
1375 command_begin = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH;";
1376 command_end = "; cat \" &";
1380 // default into a file...
1381 std::string logFilename = "/tmp/" + _NS->ContainerName(params) + "_" + hostname;
1383 logFilename += "_Proxy_";
1385 logFilename += "_Node_";
// NOTE(review): getenv("USER") is used without a null check; building a
// std::string from a null pointer is undefined behaviour.
1386 logFilename += std::string(getenv("USER")) + ".log";
1387 command_end = " > " + logFilename + " 2>&1 & ";
1390 // Step 5 : Building the command
1391 std::string command("");
1392 if (parallelLib == "Dummy")
1396 std::string command_remote("");
// Remote case: the target machine is the first line of the machine file.
1399 std::string machine_name;
1400 std::ifstream machine_file(machine_file_name.c_str());
1401 std::getline(machine_file, machine_name);
1402 MESSAGE("[BuildCommandToLaunchParallelContainer] machine file name extracted is " << machine_name)
1404 // We want to launch a command like :
1405 // ssh user@machine distantPath/runRemote.sh hostNS portNS
1406 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine_name);
1407 if (resInfo.Protocol == rsh)
1408 command_remote = "rsh ";
1410 command_remote = "ssh ";
1411 command_remote += resInfo.UserName;
1412 command_remote += "@";
1413 command_remote += machine_name;
1414 command_remote += " ";
1415 command_remote += resInfo.AppliPath; // path relative to user@machine $HOME
1416 command_remote += "/runRemote.sh ";
1417 ASSERT(getenv("NSHOST"));
1418 command_remote += getenv("NSHOST"); // hostname of CORBA name server
1419 command_remote += " ";
1420 ASSERT(getenv("NSPORT"));
1421 command_remote += getenv("NSPORT"); // port of CORBA name server
1422 command_remote += " ";
1424 hostname = machine_name;
// Command line: executable, container name, library, host, node count,
// then the naming-service option.
1427 command = real_exe_name;
1428 command += " " + _NS->ContainerName(rtn);
1429 command += " " + parallelLib;
1430 command += " " + hostname;
1431 command += " " + nbproc;
1433 AddOmninamesParams(command);
1435 command = command_begin + command_remote + command + command_end;
1436 vect_machine.push_back(hostname);
// One command per node, concatenated into `command`; machine names are read
// line by line from the machine file.
// NOTE(review): the stream is heap-allocated and released manually below; a
// stack std::ifstream would avoid the raw new/delete.
1440 std::ifstream * machine_file = NULL;
1442 machine_file = new std::ifstream(machine_file_name.c_str());
1443 for (int i= 0; i < nb_nodes; i++)
1445 std::string command_remote("");
1448 std::string machine_name;
1449 std::getline(*machine_file, machine_name);
1450 MESSAGE("[BuildCommandToLaunchParallelContainer] machine file name extracted is " << machine_name)
1452 // We want to launch a command like :
1453 // ssh user@machine distantPath/runRemote.sh hostNS portNS
1454 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine_name);
1455 if (resInfo.Protocol == rsh)
1456 command_remote = "rsh ";
1458 command_remote = "ssh ";
1459 command_remote += resInfo.UserName;
1460 command_remote += "@";
1461 command_remote += machine_name;
1462 command_remote += " ";
1463 command_remote += resInfo.AppliPath; // path relative to user@machine $HOME
1464 command_remote += "/runRemote.sh ";
1465 ASSERT(getenv("NSHOST"));
1466 command_remote += getenv("NSHOST"); // hostname of CORBA name server
1467 command_remote += " ";
1468 ASSERT(getenv("NSPORT"));
1469 command_remote += getenv("NSPORT"); // port of CORBA name server
1470 command_remote += " ";
1472 hostname = machine_name;
1475 std::ostringstream tmp;
1477 std::string proc_number = tmp.str();
1479 std::string command_tmp("");
1480 command_tmp += real_exe_name;
1481 command_tmp += " " + _NS->ContainerName(rtn);
1482 command_tmp += " " + parallelLib;
1483 command_tmp += " " + proxy_hostname;
1484 command_tmp += " " + proc_number;
1485 command_tmp += " -";
1486 AddOmninamesParams(command_tmp);
1488 // Replace _Node_ with _Node<x>_ (x = proc_number) to tell the nodes apart
1490 std::string command_end_tmp = command_end;
1491 std::string::size_type loc_node = command_end_tmp.find("_Node_");
1492 if (loc_node != std::string::npos)
// loc_node+5 is the position right after "_Node".
1493 command_end_tmp.insert(loc_node+5, proc_number);
1494 command += command_begin + command_remote + command_tmp + command_end_tmp;
1495 vect_machine.push_back(hostname);
1498 delete machine_file;
1501 else if (parallelLib == "Mpi")
1503 // Step 0: if remote we have to copy the file
1504 // to the first machine of the file
1505 std::string remote_machine("");
// NOTE(review): this stream is allocated with new and closed below, but no
// matching delete is visible in this branch -- confirm it is released.
1508 std::ifstream * machine_file = NULL;
1509 machine_file = new std::ifstream(machine_file_name.c_str());
1510 // Get first word of the line
1511 // For MPI implementation the first word is the
1513 std::getline(*machine_file, remote_machine, ' ');
1514 machine_file->close();
1515 MESSAGE("[BuildCommandToLaunchParallelContainer] machine file name extracted is " << remote_machine)
1517 // We want to launch a command like :
1518 // scp mpi_machine_file user@machine:Path
1519 std::string command_remote("");
1520 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(remote_machine);
1521 if (resInfo.Protocol == rsh)
1522 command_remote = "rcp ";
1524 command_remote = "scp ";
1526 command_remote += machine_file_name;
1527 command_remote += " ";
1528 command_remote += resInfo.UserName;
1529 command_remote += "@";
1530 command_remote += remote_machine;
1531 command_remote += ":";
1532 command_remote += machine_file_name;
1534 int status = system(command_remote.c_str());
1537 INFOS("copy of the mpi machine file failed !");
1544 std::string command_remote("");
1547 // We want to launch a command like :
1548 // ssh user@machine distantPath/runRemote.sh hostNS portNS
1549 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(remote_machine);
1550 if (resInfo.Protocol == rsh)
1551 command_remote = "rsh ";
1553 command_remote = "ssh ";
1554 command_remote += resInfo.UserName;
1555 command_remote += "@";
1556 command_remote += remote_machine;
1557 command_remote += " ";
1558 command_remote += resInfo.AppliPath; // path relative to user@machine $HOME
1559 command_remote += "/runRemote.sh ";
1560 ASSERT(getenv("NSHOST"));
1561 command_remote += getenv("NSHOST"); // hostname of CORBA name server
1562 command_remote += " ";
1563 ASSERT(getenv("NSPORT"));
1564 command_remote += getenv("NSPORT"); // port of CORBA name server
1565 command_remote += " ";
1567 hostname = remote_machine;
1570 // We use Dummy proxy for MPI parallel containers
1571 real_exe_name = exe_name + "Dummy";
1572 command = real_exe_name;
1573 command += " " + _NS->ContainerName(rtn);
1574 command += " Dummy";
1575 command += " " + hostname;
1576 command += " " + nbproc;
1578 AddOmninamesParams(command);
1580 command = command_begin + command_remote + command + command_end;
1581 vect_machine.push_back(hostname);
1585 std::string command_remote("");
1588 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(remote_machine);
1589 if (resInfo.Protocol == rsh)
1590 command_remote = "rsh ";
1592 command_remote = "ssh ";
1593 command_remote += resInfo.UserName;
1594 command_remote += "@";
1595 command_remote += remote_machine;
1596 command_remote += " ";
// Here the runRemote.sh invocation is prepended to the executable name
// rather than to command_remote.
1598 std::string new_real_exe_name("");
1599 new_real_exe_name += resInfo.AppliPath; // path relative to user@machine $HOME
1600 new_real_exe_name += "/runRemote.sh ";
1601 ASSERT(getenv("NSHOST"));
1602 new_real_exe_name += getenv("NSHOST"); // hostname of CORBA name server
1603 new_real_exe_name += " ";
1604 ASSERT(getenv("NSPORT"));
1605 new_real_exe_name += getenv("NSPORT"); // port of CORBA name server
1606 new_real_exe_name += " ";
1608 real_exe_name = new_real_exe_name + real_exe_name;
1609 hostname = remote_machine;
// The MPI launcher depends on the resource's mpi setting: "mpiexec" with a
// machine file when it is lam, and an "mpirun -np" form as the other variant.
1612 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(hostname);
1613 if (resInfo.mpi == lam)
1615 command = "mpiexec -ssi boot ";
1616 if (resInfo.Protocol == rsh)
1620 command += "-machinefile " + machine_file_name + " ";
1621 command += "-n " + nbproc + " ";
1622 command += real_exe_name;
1623 command += " " + _NS->ContainerName(rtn);
1624 command += " " + parallelLib;
1625 command += " " + proxy_hostname;
1627 AddOmninamesParams(command);
1631 command = "mpirun -np " + nbproc + " ";
1632 command += real_exe_name;
1633 command += " " + _NS->ContainerName(rtn);
1634 command += " " + parallelLib;
1635 command += " " + proxy_hostname;
1637 AddOmninamesParams(command);
1640 command = command_begin + command_remote + command + command_end;
// One vect_machine entry per node, each holding proxy_hostname.
1641 for (int i= 0; i < nb_nodes; i++)
1642 vect_machine.push_back(proxy_hostname);
1647 std::string message("Unknown parallelLib : " + parallelLib);
1648 throw SALOME_Exception(message.c_str());
1651 MESSAGE("Parallel launch is: " << command);
1655 string SALOME_ContainerManager::GetMPIZeroNode(string machine)
1660 string tmpFile = BuildTemporaryFileName();
1662 cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile;
1664 status = system(cmd.c_str());
1666 ifstream fp(tmpFile.c_str(),ios::in);