1 // Copyright (C) 2007-2008 CEA/DEN, EDF R&D, OPEN CASCADE
3 // Copyright (C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
4 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 2.1 of the License.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // Lesser General Public License for more details.
16 // You should have received a copy of the GNU Lesser General Public
17 // License along with this library; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
22 #include "SALOME_ContainerManager.hxx"
23 #include "SALOME_NamingService.hxx"
24 #include "SALOME_ModuleCatalog.hh"
25 #include "Basics_Utils.hxx"
26 #include "Basics_DirUtils.hxx"
27 #include <sys/types.h>
33 #include "Utils_CorbaException.hxx"
34 #include "Batch_Date.hxx"
37 #ifdef WITH_PACO_PARALLEL
// Initial value of the `count` variables that bound the naming-service wait
// loops in StartContainer and LaunchParallelContainer.
41 #define TIME_OUT_TO_LAUNCH_CONT 61
// Static members: the containers collected by fillBatchLaunchedContainers()
// and the cursor that GiveContainer() advances through them.
45 vector<Engines::Container_ptr> SALOME_ContainerManager::_batchLaunchedContainers;
47 vector<Engines::Container_ptr>::iterator SALOME_ContainerManager::_batchLaunchedContainersIter;
// Name used to register the ContainerManager in the naming service (constructor)
// and to remove it again (Shutdown).
// NOTE(review): the initializer value is on a line not shown in this listing.
49 const char *SALOME_ContainerManager::_ContainerManagerNameInNS =
52 //=============================================================================
56 * Define a CORBA single-thread policy for the server, which avoids having to deal
57 * with non-thread-safe usage such as Change_Directory in the SALOME naming service
59 //=============================================================================
// Constructor: sets up the ContainerManager CORBA servant.
//  - orb: ORB reference, duplicated into _orb.
//  - poa: parent POA; its POAManager is reused for the child POA created below.
//  - rm / ns: resources manager and naming service helpers.
//    NOTE(review): the lines storing them (presumably into _ResManager / _NS)
//    are not visible in this listing - confirm.
// A child POA named "SThreadPOA" is created with a SINGLE_THREAD_MODEL thread
// policy, this object is activated on it, and the resulting reference is
// registered in the naming service under _ContainerManagerNameInNS.
61 SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, SALOME_ResourcesManager *rm, SALOME_NamingService *ns)
63 MESSAGE("constructor");
67 PortableServer::POAManager_var pman = poa->the_POAManager();
68 _orb = CORBA::ORB::_duplicate(orb) ;
69 CORBA::PolicyList policies;
// NOTE(review): policies[0] is assigned below; the line that sizes the list is
// not visible here - confirm it exists.
71 PortableServer::ThreadPolicy_var threadPol =
72 poa->create_thread_policy(PortableServer::SINGLE_THREAD_MODEL);
73 policies[0] = PortableServer::ThreadPolicy::_duplicate(threadPol);
75 _poa = poa->create_POA("SThreadPOA",pman,policies);
// Activate this servant on the single-threaded POA and publish its reference.
77 PortableServer::ObjectId_var id = _poa->activate_object(this);
78 CORBA::Object_var obj = _poa->id_to_reference(id);
79 Engines::ContainerManager_var refContMan =
80 Engines::ContainerManager::_narrow(obj);
82 _NS->Register(refContMan,_ContainerManagerNameInNS);
// Remember whether the APPLI environment variable is set;
// BuildCommandToLaunchRemoteContainer branches on this flag.
83 _isAppliSalomeDefined = (getenv("APPLI") != 0);
84 MESSAGE("constructor end");
87 //=============================================================================
91 //=============================================================================
// Destructor: the only visible action is a trace message. Deregistration from
// the naming service and servant deactivation are done by Shutdown().
93 SALOME_ContainerManager::~SALOME_ContainerManager()
95 MESSAGE("destructor");
98 //=============================================================================
99 //! shutdown all the containers, then the ContainerManager servant
102 //=============================================================================
// Shuts down every registered container, removes the ContainerManager entry
// from the naming service, then deactivates this servant on its POA.
104 void SALOME_ContainerManager::Shutdown()
107 ShutdownContainers();
108 _NS->Destroy_Name(_ContainerManagerNameInNS);
// Look up the object id of this servant so it can be deactivated.
109 PortableServer::ObjectId_var oid = _poa->servant_to_id(this);
110 _poa->deactivate_object(oid);
113 //=============================================================================
114 //! Loop on all the containers listed in naming service, ask shutdown on each
117 //=============================================================================
// Walks the "/Containers" directory of the naming service in two passes:
//  1. collect the names of all entries that resolve and narrow to a non-nil
//     Engines::Container;
//  2. resolve each collected name again and process the container.
// CORBA and unknown exceptions are logged and ignored so that one failing
// container does not stop the loop.
// NOTE(review): the declaration of isOK, the test on it, and the actual
// shutdown call on each container are on lines not shown in this listing.
119 void SALOME_ContainerManager::ShutdownContainers()
121 MESSAGE("ShutdownContainers");
123 isOK = _NS->Change_Directory("/Containers");
125 vector<string> vec = _NS->list_directory_recurs();
126 list<string> lstCont;
// Pass 1: keep only the entries that are really containers.
127 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++)
130 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
133 Engines::Container_var cont=Engines::Container::_narrow(obj);
134 if(!CORBA::is_nil(cont))
135 lstCont.push_back((*iter));
137 catch(const CORBA::Exception& e)
139 // ignore this entry and continue
142 MESSAGE("Container list: ");
143 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
// Pass 2: resolve each container again, since it may have disappeared since pass 1.
146 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++)
151 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
152 Engines::Container_var cont=Engines::Container::_narrow(obj);
153 if(!CORBA::is_nil(cont))
155 MESSAGE("ShutdownContainers: " << (*iter));
159 MESSAGE("ShutdownContainers: no container ref for " << (*iter));
161 catch(CORBA::SystemException& e)
163 INFOS("CORBA::SystemException ignored : " << e);
165 catch(CORBA::Exception&)
167 INFOS("CORBA::Exception ignored.");
171 INFOS("Unknown exception ignored.");
177 //=============================================================================
178 //! Give a suitable Container given constraints
180 * \param params Machine Parameters required for the container
181 * \return the container or nil
183 //=============================================================================
// Returns a container for the given machine parameters.
// When the SALOME_BATCH environment variable equals "1", containers already
// listed in the naming service are handed out in turn from
// _batchLaunchedContainers (the list is filled on first use and the cursor
// wraps back to the beginning when it reaches the end). Otherwise the request
// is delegated to StartContainer(params).
// NOTE(review): the null check on valenv and the `return rtn;` statement are
// presumably on lines not shown in this listing - confirm. If the batch list
// stays empty after filling, the dereference of the cursor needs a guard.
185 Engines::Container_ptr
186 SALOME_ContainerManager::GiveContainer(const Engines::MachineParameters& params)
188 char *valenv=getenv("SALOME_BATCH");
190 if (strcmp(valenv,"1")==0)
192 if(_batchLaunchedContainers.empty())
193 fillBatchLaunchedContainers();
// Wrap around once every batch container has been handed out.
195 if (_batchLaunchedContainersIter == _batchLaunchedContainers.end())
196 _batchLaunchedContainersIter = _batchLaunchedContainers.begin();
198 Engines::Container_ptr rtn = Engines::Container::_duplicate(*_batchLaunchedContainersIter);
199 _batchLaunchedContainersIter++;
202 return StartContainer(params);
205 //=============================================================================
206 //! Start a suitable Container in a list of machines with constraints
208 * Constraints are given by a machine parameters struct
209 * \param params Machine Parameters required for the container
210 * \param possibleComputers list of machines usable for start
211 * \param container_exe specific container executable (default=SALOME_Container)
213 //=============================================================================
// Starts (or reuses) a container on one machine chosen among possibleComputers.
//  - params: requested parameters. params.mode ("get", "getorstart", "start" or
//    "") selects between reusing and restarting an existing container;
//    params.policy is passed to the resources manager to pick the machine.
//  - possibleComputers: candidate machine names.
//  - container_exe: executable name forwarded to the command builders.
// Returns the container reference, or nil when no machine can be selected or
// the launch command fails.
// NOTE(review): several short lines (braces, try/else keywords, the
// declarations of lm, theMachine and command, the status tests and the final
// return) are not visible in this listing.
215 Engines::Container_ptr
216 SALOME_ContainerManager::StartContainer(const Engines::MachineParameters& params,
217 const Engines::MachineList& possibleComputers,
218 const std::string& container_exe)
// With PaCO++ support compiled in, a non-empty params.parallelLib redirects the
// request to StartParallelContainer using possibleComputers as computer list.
220 #ifdef WITH_PACO_PARALLEL
221 std::string parallelLib(params.parallelLib);
222 if (parallelLib != "")
224 Engines::MachineParameters myparams(params);
225 myparams.computerList=possibleComputers;
226 return StartParallelContainer(myparams);
229 string containerNameInNS;
230 Engines::Container_ptr ret = Engines::Container::_nil();
232 MESSAGE("SALOME_ContainerManager::StartContainer " << possibleComputers.length());
235 // if mode is "get" keep only machines with existing containers
236 if(std::string(params.mode.in())=="get")
238 for(unsigned int i=0;i<possibleComputers.length();i++)
240 Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
243 if(!cont->_non_existent())
244 lm.push_back(string(possibleComputers[i]));
246 catch(CORBA::Exception&)
248 // CORBA::Exception ignored.
// Other modes: every candidate machine is kept.
254 for(unsigned int i=0;i<possibleComputers.length();i++)
255 lm.push_back(string(possibleComputers[i]));
// Let the resources manager choose one machine according to params.policy.
261 theMachine=_ResManager->GetImpl()->Find(params.policy.in(),lm);
263 catch( const SALOME_Exception &ex )
266 return Engines::Container::_nil();
269 //If the machine name is localhost use the real name
270 if(theMachine == "localhost")
271 theMachine=Kernel_Utils::GetHostname();
273 //check if an entry exists in Naming service
274 //if params.mode == "start" or "" shutdown the existing container before launching a new one with that name
275 //if params.mode == "getorstart" or "get" use the existing container
276 containerNameInNS = _NS->BuildContainerNameForNS(params,theMachine.c_str());
278 SCRUTE(containerNameInNS);
279 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
280 if ( !CORBA::is_nil(obj) )
284 Engines::Container_var cont=Engines::Container::_narrow(obj);
285 if(!cont->_non_existent())
287 if(std::string(params.mode.in())=="getorstart"||std::string(params.mode.in())=="get")
288 return cont._retn(); /* the container exists and params.mode is getorstart or get use it*/
290 cont->Shutdown(); // shutdown the registered container if it exists
293 catch(CORBA::Exception&)
295 INFOS("CORBA::Exception ignored.");
299 //try to launch a new container
300 MESSAGE("try to launch it on " << theMachine);
304 MESSAGE("SALOME_ContainerManager::StartContainer : no possible computer");
305 return Engines::Container::_nil();
// Local machine: run the generated script. Remote machine: build an rsh/ssh command.
307 else if(theMachine==Kernel_Utils::GetHostname())
308 command = BuildCommandToLaunchLocalContainer(params,container_exe);
310 command = BuildCommandToLaunchRemoteContainer(theMachine,params,container_exe);
312 //redirect stdout and stderr in a file
// NOTE(review): getenv( "USER" ) is concatenated without a visible null check -
// confirm USER is always defined in the launch environment.
313 string logFilename="/tmp/"+_NS->ContainerName(params)+"_"+ theMachine +"_"+getenv( "USER" )+".log" ;
314 command += " > " + logFilename + " 2>&1 &";
316 // launch container with a system call
317 int status=system(command.c_str());
320 MESSAGE("SALOME_ContainerManager::StartContainer rsh failed (system command status -1)");
321 RmTmpFile(_TmpFileName); // command file can be removed here
322 return Engines::Container::_nil();
324 else if (status == 217){
325 MESSAGE("SALOME_ContainerManager::StartContainer rsh failed (system command status 217)");
326 RmTmpFile(_TmpFileName); // command file can be removed here
327 return Engines::Container::_nil();
// Poll the naming service until the new container registers itself or the
// counter reaches zero (the wait and the decrement are on lines not shown here).
330 int count=TIME_OUT_TO_LAUNCH_CONT;
331 MESSAGE("count = "<<count);
332 while ( CORBA::is_nil(ret) && count ){
340 MESSAGE( count << ". Waiting for container on " << theMachine);
342 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
343 ret=Engines::Container::_narrow(obj);
346 if ( CORBA::is_nil(ret) )
348 MESSAGE("SALOME_ContainerManager::StartContainer rsh failed");
// Store the log location on the container in the form user@host:path.
352 logFilename=":"+logFilename;
353 logFilename="@"+Kernel_Utils::GetHostname()+logFilename;
354 logFilename=getenv( "USER" )+logFilename;
355 ret->logfilename(logFilename.c_str());
358 RmTmpFile(_TmpFileName); // command file can be removed here
363 //=============================================================================
364 //! Start a suitable Container given constraints
366 * \param params Machine Parameters required for the container
368 //=============================================================================
// Starts a container for params on any machine returned by
// _ResManager->GetFittingResources(params).
// The module catalog ("/Kernel/ModulCatalog" in the naming service) is queried
// for each entry of params.componentList. A component whose implementation
// type is CEXE supplies a specific container executable, and a second CEXE
// component in the same request is reported as an error.
// Returns nil when the catalog cannot be reached, on a CEXE conflict, or on
// any caught exception; otherwise delegates to the three-argument overload.
// NOTE(review): the `found` bookkeeping, the try keyword and the condition
// choosing between the two final return statements are on lines not shown here.
370 Engines::Container_ptr
371 SALOME_ContainerManager::StartContainer(const Engines::MachineParameters& params)
373 Engines::MachineList_var possibleComputers = _ResManager->GetFittingResources(params);
375 // Look into ModulCatalog if a specific container must be launched
376 CORBA::String_var container_exe;
380 CORBA::Object_var obj = _NS->Resolve("/Kernel/ModulCatalog");
381 SALOME_ModuleCatalog::ModuleCatalog_var Catalog = SALOME_ModuleCatalog::ModuleCatalog::_narrow(obj) ;
382 if (CORBA::is_nil (Catalog))
383 return Engines::Container::_nil();
384 // Loop through component list
385 for(unsigned int i=0;i<params.componentList.length();i++)
387 const char* compoi = params.componentList[i];
388 SALOME_ModuleCatalog::Acomponent_var compoInfo = Catalog->GetComponent(compoi);
389 if (CORBA::is_nil (compoInfo))
393 SALOME_ModuleCatalog::ImplType impl=compoInfo->implementation_type();
394 container_exe=compoInfo->implementation_name();
395 if(impl==SALOME_ModuleCatalog::CEXE)
399 INFOS("ContainerManager Error: you can't have 2 CEXE component in the same container" );
400 return Engines::Container::_nil();
406 catch (ServiceUnreachable&)
408 INFOS("Caught exception: Naming Service Unreachable");
409 return Engines::Container::_nil();
413 INFOS("Caught unknown exception.");
414 return Engines::Container::_nil();
// With a specific executable: pass it on. Otherwise use the overload's default.
418 return StartContainer(params,possibleComputers,container_exe.in());
420 return StartContainer(params,possibleComputers);
423 //=============================================================================
424 //! Find or start a suitable Container given some constraints
426 * \param params Machine Parameters required for the container
427 * \return the container or nil
429 //=============================================================================
// Looks for an existing container matching params on the machines of
// params.computerList and, when none is found, starts a new one with
// StartContainer(params).
// NOTE(review): the statement executed when `ret` is non-nil (presumably
// returning it) is on a line not shown in this listing.
431 Engines::Container_ptr
432 SALOME_ContainerManager::FindOrStartContainer(const Engines::MachineParameters& params)
434 Engines::Container_ptr ret = FindContainer(params,params.computerList);
435 if(!CORBA::is_nil(ret))
437 MESSAGE("Container doesn't exist try to launch it ...");
439 return StartContainer(params);
442 //=============================================================================
443 //! Find a container given constraints (params) on a list of machines (possibleComputers)
447 //=============================================================================
// Searches the machines of possibleComputers in order, calling the
// single-machine FindContainer overload for each. Returns nil when no
// machine yields a non-nil container.
// NOTE(review): the statement executed when `cont` is non-nil (presumably
// returning it) is on a line not shown in this listing.
449 Engines::Container_ptr
450 SALOME_ContainerManager::FindContainer(const Engines::MachineParameters& params,
451 const Engines::MachineList& possibleComputers)
453 MESSAGE("FindContainer "<<possibleComputers.length());
454 for(unsigned int i=0;i<possibleComputers.length();i++)
456 MESSAGE("FindContainer possible " << possibleComputers[i]);
457 Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
458 if( !CORBA::is_nil(cont) )
461 MESSAGE("FindContainer: not found");
462 return Engines::Container::_nil();
465 //=============================================================================
466 //! Find a container given constraints (params) on a machine (theMachine)
470 //=============================================================================
// Resolves the naming-service entry built from params and theMachine.
// Returns the narrowed container reference, or nil when the object no longer
// exists or a CORBA exception is raised while checking it.
// NOTE(review): the try keyword and the branch structure around
// _non_existent() are on lines not shown in this listing.
472 Engines::Container_ptr
473 SALOME_ContainerManager::FindContainer(const Engines::MachineParameters& params,
474 const char *theMachine)
476 string containerNameInNS(_NS->BuildContainerNameForNS(params,theMachine));
477 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
// A stale naming-service entry (object no longer reachable) counts as "not found".
480 if(obj->_non_existent())
481 return Engines::Container::_nil();
483 return Engines::Container::_narrow(obj);
485 catch(const CORBA::Exception& e)
487 return Engines::Container::_nil();
491 #ifdef WITH_PACO_PARALLEL
492 //=============================================================================
494 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
495 * \param params Machine Parameters required for the container
496 * \return CORBA container reference.
498 //=============================================================================
// PaCO++ build: finds an existing container matching params_const or starts a
// new parallel container made of one proxy process and
// params.nb_component_nodes node processes.
// Visible steps: look for an existing container, choose a machine, launch the
// proxy, launch the nodes, resolve each node in the naming service, and start
// the parallel object through the proxy.
// Failures are reported with INFOS.
// NOTE(review): the early returns after each failure, the try keywords, the
// node deployment call and the final return are on lines not shown here.
499 Engines::Container_ptr
500 SALOME_ContainerManager::StartParallelContainer(const Engines::MachineParameters& params_const)
502 CORBA::Object_var obj;
503 PaCO::InterfaceManager_var container_proxy;
504 Engines::Container_ptr ret = Engines::Container::_nil();
// Work on a copy because hostname is overwritten below.
505 Engines::MachineParameters params(params_const);
507 // Step 1 : Try to find a suitable container
508 // Currently not as good as it could be, since
509 // the number of nodes of the found container is not verified
510 // when the user requests a specific one.
511 ret = FindContainer(params, params.computerList);
512 if(CORBA::is_nil(ret)) {
513 // Step 2 : Starting a new parallel container !
514 INFOS("[StartParallelContainer] Starting a PaCO++ parallel container");
516 // Step 3 : Choose a computer
517 std::string theMachine = _ResManager->FindFirst(params.computerList);
518 //If the machine name is localhost use the real name
519 if(theMachine == "localhost")
520 theMachine=Kernel_Utils::GetHostname();
522 if(theMachine == "") {
523 INFOS("[StartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
524 INFOS("[StartParallelContainer] No possible computer found");
525 INFOS("[StartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
528 INFOS("[StartParallelContainer] on machine : " << theMachine);
529 params.hostname = CORBA::string_dup(theMachine.c_str());
531 // Step 4 : starting parallel container proxy
532 Engines::MachineParameters params_proxy(params);
533 std::string command_proxy;
534 SALOME_ContainerManager::actual_launch_machine_t proxy_machine;
537 command_proxy = BuildCommandToLaunchParallelContainer("SALOME_ParallelContainerProxy", params_proxy, proxy_machine);
539 catch(const SALOME_Exception & ex)
541 INFOS("[StartParallelContainer] Exception in BuildCommandToLaunchParallelContainer");
545 params_proxy.nb_component_nodes = 0; // LaunchParallelContainer uses this value to know if it launches the proxy or the nodes
546 obj = LaunchParallelContainer(command_proxy, params_proxy, _NS->ContainerName(params_proxy), proxy_machine);
547 if (CORBA::is_nil(obj))
549 INFOS("[StartParallelContainer] LaunchParallelContainer for proxy returns NIL !");
554 container_proxy = PaCO::InterfaceManager::_narrow(obj);
556 catch(CORBA::SystemException& e)
558 INFOS("[StartParallelContainer] Exception in _narrow after LaunchParallelContainer for proxy !");
559 INFOS("CORBA::SystemException : " << e);
562 catch(CORBA::Exception& e)
564 INFOS("[StartParallelContainer] Exception in _narrow after LaunchParallelContainer for proxy !");
565 INFOS("CORBA::Exception" << e);
570 INFOS("[StartParallelContainer] Exception in _narrow after LaunchParallelContainer for proxy !");
571 INFOS("Unknown exception !");
574 if (CORBA::is_nil(container_proxy))
576 INFOS("[StartParallelContainer] PaCO::InterfaceManager::_narrow returns NIL !");
580 // Step 5 : starting parallel container nodes
581 std::string command_nodes;
582 Engines::MachineParameters params_nodes(params);
583 SALOME_ContainerManager::actual_launch_machine_t nodes_machines;
586 command_nodes = BuildCommandToLaunchParallelContainer("SALOME_ParallelContainerNode", params_nodes, nodes_machines, proxy_machine[0]);
588 catch(const SALOME_Exception & ex)
590 INFOS("[StartParallelContainer] Exception in BuildCommandToLaunchParallelContainer");
// Node entries are registered as <container name>Node<index>.
594 std::string container_generic_node_name = _NS->ContainerName(params) + "Node";
595 obj = LaunchParallelContainer(command_nodes, params_nodes, container_generic_node_name, nodes_machines);
596 if (CORBA::is_nil(obj))
598 INFOS("[StartParallelContainer] LaunchParallelContainer for nodes returns NIL !");
599 // The proxy must be killed
602 Engines::Container_var proxy = Engines::Container::_narrow(container_proxy);
607 INFOS("[StartParallelContainer] Exception catched from proxy Shutdown...");
612 // Step 6 : connecting nodes and the proxy to actually create a parallel container
613 for (int i = 0; i < params.nb_component_nodes; i++)
615 std::ostringstream tmp;
617 std::string proc_number = tmp.str();
618 std::string container_node_name = container_generic_node_name + proc_number;
620 std::string theNodeMachine(nodes_machines[i]);
621 std::string containerNameInNS = _NS->BuildContainerNameForNS(container_node_name.c_str(), theNodeMachine.c_str());
622 obj = _NS->Resolve(containerNameInNS.c_str());
623 if (CORBA::is_nil(obj))
625 INFOS("[StartParallelContainer] CONNECTION FAILED From Naming Service !");
626 INFOS("[StartParallelContainer] Container name is " << containerNameInNS);
631 MESSAGE("[StartParallelContainer] Deploying node : " << container_node_name);
632 PaCO::InterfaceParallel_var node = PaCO::InterfaceParallel::_narrow(obj);
634 MESSAGE("[StartParallelContainer] node " << container_node_name << " is deployed");
636 catch(CORBA::SystemException& e)
638 INFOS("[StartParallelContainer] Exception in deploying node : " << containerNameInNS);
639 INFOS("CORBA::SystemException : " << e);
642 catch(CORBA::Exception& e)
644 INFOS("[StartParallelContainer] Exception in deploying node : " << containerNameInNS);
645 INFOS("CORBA::Exception" << e);
650 INFOS("[StartParallelContainer] Exception in deploying node : " << containerNameInNS);
651 INFOS("Unknown exception !");
656 // Step 7 : starting parallel container
659 MESSAGE ("[StartParallelContainer] Starting parallel object");
660 container_proxy->start();
661 MESSAGE ("[StartParallelContainer] Parallel object is started");
// The proxy is what callers receive as the container reference.
662 ret = Engines::Container::_narrow(container_proxy);
664 catch(CORBA::SystemException& e)
666 INFOS("Caught CORBA::SystemException. : " << e);
668 catch(PortableServer::POA::ServantAlreadyActive&)
670 INFOS("Caught CORBA::ServantAlreadyActiveException");
672 catch(CORBA::Exception&)
674 INFOS("Caught CORBA::Exception.");
676 catch(std::exception& exc)
678 INFOS("Caught std::exception - "<<exc.what());
682 INFOS("Caught unknown exception.");
688 //=============================================================================
690 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
691 * \param params Machine Parameters required for the container
692 * \return CORBA container reference.
694 //=============================================================================
// Variant that only logs that the parallel extension is disabled; params is
// not used and `ret` is initialised to nil.
// NOTE(review): this appears to be the alternative to the WITH_PACO_PARALLEL
// implementation; the preprocessor lines separating the two and the return
// statement are not shown in this listing.
695 Engines::Container_ptr
696 SALOME_ContainerManager::StartParallelContainer(const Engines::MachineParameters& params)
698 Engines::Container_ptr ret = Engines::Container::_nil();
699 INFOS("[StartParallelContainer] is disabled !");
700 INFOS("[StartParallelContainer] recompile SALOME Kernel to enable parallel extension");
705 //=============================================================================
706 /*! This method launches the parallel container.
707 * It may eventually be moved into the resources manager.
709 * \param command to launch
710 * \param container's parameters
711 * \param name of the container
713 * \return CORBA container reference
715 //=============================================================================
// Runs `command` with system() and then waits for the launched process(es) to
// appear in the naming service.
//  - command: shell command to execute.
//  - params: params.nb_component_nodes == 0 means a proxy was launched (a
//    single entry named `name` on vect_machine[0] is awaited); otherwise one
//    entry per node is awaited, named `name` plus a per-node suffix, on
//    vect_machine[i].
//  - name: container name used to build the naming-service entries.
//  - vect_machine: machines on which the processes were launched.
// NOTE(review): the return type line, the status == -1 test, the wait and
// decrement inside the polling loops, the per-node suffix formatting and the
// return statement are on lines not shown in this listing. `count` is
// initialised once, so the visible code suggests it is shared by all nodes -
// confirm.
717 SALOME_ContainerManager::LaunchParallelContainer(const std::string& command,
718 const Engines::MachineParameters& params,
719 const std::string& name,
720 SALOME_ContainerManager::actual_launch_machine_t & vect_machine)
722 CORBA::Object_ptr obj = CORBA::Object::_nil();
723 std::string containerNameInNS;
724 int count = TIME_OUT_TO_LAUNCH_CONT;
726 INFOS("[LaunchParallelContainer] Begin");
727 int status = system(command.c_str());
729 INFOS("[LaunchParallelContainer] failed : system command status -1");
732 else if (status == 217) {
733 INFOS("[LaunchParallelContainer] failed : system command status 217");
737 if (params.nb_component_nodes == 0)
739 std::string theMachine(vect_machine[0]);
740 // Proxy case: a proxy has been launched
741 containerNameInNS = _NS->BuildContainerNameForNS((char*) name.c_str(), theMachine.c_str());
742 INFOS("[LaunchParallelContainer] Waiting for Parallel Container proxy " << containerNameInNS << " on " << theMachine);
743 while (CORBA::is_nil(obj) && count)
751 obj = _NS->Resolve(containerNameInNS.c_str());
756 INFOS("[LaunchParallelContainer] launching the nodes of the parallel container");
757 // Wait for all the nodes
758 for (int i = 0; i < params.nb_component_nodes; i++)
// Reset so that the polling loop below runs for each node.
760 obj = CORBA::Object::_nil();
761 std::string theMachine(vect_machine[i]);
763 std::ostringstream tmp;
765 std::string proc_number = tmp.str();
766 std::string container_node_name = name + proc_number;
767 containerNameInNS = _NS->BuildContainerNameForNS((char*) container_node_name.c_str(), theMachine.c_str());
768 INFOS("[LaunchParallelContainer] Waiting for Parallel Container node " << containerNameInNS << " on " << theMachine);
769 while (CORBA::is_nil(obj) && count) {
776 obj = _NS->Resolve(containerNameInNS.c_str());
778 if (CORBA::is_nil(obj))
780 INFOS("[LaunchParallelContainer] Launch of node failed (or not found) !");
785 if (CORBA::is_nil(obj))
786 INFOS("[LaunchParallelContainer] failed");
// Rebuilds _batchLaunchedContainers from the "/Containers" directory of the
// naming service, keeping every entry that narrows to a non-nil
// Engines::Container, then resets the cursor to the first element.
// The result of Change_Directory is not checked here, and no exception
// handling is visible around Resolve/_narrow.
791 void SALOME_ContainerManager::fillBatchLaunchedContainers()
793 _batchLaunchedContainers.clear();
794 _NS->Change_Directory("/Containers");
795 vector<string> vec = _NS->list_directory_recurs();
796 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++){
797 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
// The narrowed reference is stored as a raw _ptr in the static vector.
798 Engines::Container_ptr cont=Engines::Container::_narrow(obj);
799 if(!CORBA::is_nil(cont)){
800 _batchLaunchedContainers.push_back(cont);
803 _batchLaunchedContainersIter=_batchLaunchedContainers.begin();
806 //=============================================================================
808 * This is no longer valid (C++ containers are also Python containers)
810 //=============================================================================
// Tests whether ContainerName ends with the two characters "Py", which is the
// only check visible here.
// NOTE(review): the length guard protecting the `len - 2` offset and the
// return statements are on lines not shown in this listing - confirm that
// names shorter than two characters are handled.
812 bool isPythonContainer(const char* ContainerName)
815 int len = strlen(ContainerName);
818 if (strcmp(ContainerName + len - 2, "Py") == 0)
824 //=============================================================================
826 * Builds the script to be launched
828 * If SALOME Application not defined ($APPLI),
829 * see BuildTempFileToLaunchRemoteContainer()
831 * Else rely on distant configuration. Command is under the form (example):
832 * ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \
833 * SALOME_Container containerName &"
835 * - where user is omitted if not specified in CatalogResources,
836 * - where distant path is always relative to user@machine $HOME, and
837 * equal to $APPLI if not specified in CatalogResources,
838 * - where hostNS is the hostname of CORBA naming server (set by scripts to
839 * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh)
840 * - where portNS is the port used by CORBA naming server (set by scripts to
841 * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh)
842 * - where workingdir is the requested working directory for the container.
843 * If WORKINGDIR (and workingdir) is not present the working dir will be $HOME
845 //=============================================================================
// Builds the shell command that launches a container on a remote machine.
//  - machine: target machine, looked up in the resources list.
//  - params: container parameters (node/processor counts, working directory,
//    container name).
//  - container_exe: container executable used for the non-MPI case.
// When $APPLI is not defined (_isAppliSalomeDefined false) the work is
// delegated to BuildTempFileToLaunchRemoteContainer. Otherwise the command is
// assembled from the protocol and user of the resource, the application path
// followed by "/runRemote.sh", the naming-service host and port, an optional
// WORKINGDIR clause, the MPI or plain executable, the container name and the
// naming-service options.
// Throws SALOME_Exception("Unknown protocol") when the resource protocol is
// neither rsh nor ssh.
// NOTE(review): the return type line, the declarations of command and nbproc,
// the strings appended for rsh/ssh, several separators, the MPI preprocessor
// conditions and the return statement are on lines not shown in this listing.
848 SALOME_ContainerManager::BuildCommandToLaunchRemoteContainer
849 (const string& machine,
850 const Engines::MachineParameters& params, const std::string& container_exe)
855 if ( ! _isAppliSalomeDefined )
856 command = BuildTempFileToLaunchRemoteContainer(machine, params);
860 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine);
// Derive the process count: a zero count on one side means only the other is used.
864 if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) )
866 else if ( params.nb_node == 0 )
867 nbproc = params.nb_proc_per_node;
868 else if ( params.nb_proc_per_node == 0 )
869 nbproc = params.nb_node;
871 nbproc = params.nb_node * params.nb_proc_per_node;
874 // "ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \
875 // SALOME_Container containerName &"
877 if (resInfo.Protocol == rsh)
879 else if (resInfo.Protocol == ssh)
882 throw SALOME_Exception("Unknown protocol");
884 if (resInfo.UserName != "")
886 command += resInfo.UserName;
// The application path comes from the resource description, or from $APPLI.
893 if (resInfo.AppliPath != "")
894 command += resInfo.AppliPath; // path relative to user@machine $HOME
897 ASSERT(getenv("APPLI"));
898 command += getenv("APPLI"); // path relative to user@machine $HOME
901 command += "/runRemote.sh ";
903 ASSERT(getenv("NSHOST"));
904 command += getenv("NSHOST"); // hostname of CORBA name server
907 ASSERT(getenv("NSPORT"));
908 command += getenv("NSPORT"); // port of CORBA name server
910 std::string wdir=params.workingdir.in();
913 command += " WORKINGDIR ";
915 if(wdir == "$TEMPDIR")
917 command += wdir; // requested working directory
923 command += " mpirun -np ";
924 std::ostringstream o;
928 command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
929 #elif defined(WITHOPENMPI)
// With OpenMPI, pass the ompi-server URI file when OMPI_URI_FILE is set.
930 if( getenv("OMPI_URI_FILE") == NULL )
931 command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
933 command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
934 command += getenv("OMPI_URI_FILE");
937 command += " SALOME_MPIContainer ";
940 command += " " +container_exe+ " ";
942 command += _NS->ContainerName(params);
944 AddOmninamesParams(command);
946 MESSAGE("command =" << command);
952 //=============================================================================
954 * builds the command to be launched.
956 //=============================================================================
// Writes a launch script for a local container into a fresh temporary file
// (_TmpFileName) and uses that file path as the command to execute.
//  - params: container parameters (node/processor counts, working directory,
//    container name).
//  - container_exe: executable written for the non-Python, non-MPI case.
// The script optionally starts with an mpirun prefix or a change to the
// requested working directory, followed by the executable, the container name
// and the naming-service options.
// NOTE(review): the return type line, the declarations of command and nbproc,
// the MPI and Windows preprocessor conditions, the test on wdir being
// non-empty and the return statement are on lines not shown in this listing.
959 SALOME_ContainerManager::BuildCommandToLaunchLocalContainer
960 (const Engines::MachineParameters& params, const std::string& container_exe)
962 _TmpFileName = BuildTemporaryFileName();
966 ofstream command_file( _TmpFileName.c_str() );
970 //command = "mpirun -np ";
971 command_file << "mpirun -np ";
// Derive the process count: a zero count on one side means only the other is used.
973 if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) )
975 else if ( params.nb_node == 0 )
976 nbproc = params.nb_proc_per_node;
977 else if ( params.nb_proc_per_node == 0 )
978 nbproc = params.nb_node;
980 nbproc = params.nb_node * params.nb_proc_per_node;
982 //std::ostringstream o;
984 //o << nbproc << " ";
985 command_file << nbproc << " ";
987 //command += o.str();
989 //command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
990 command_file << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
991 #elif defined(WITHOPENMPI)
992 //command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace ";
993 if( getenv("OMPI_URI_FILE") == NULL )
994 command_file << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
997 command_file << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
998 command_file << getenv("OMPI_URI_FILE");
1002 if (isPythonContainer(params.container_name))
1003 //command += "pyMPI SALOME_ContainerPy.py ";
1004 command_file << " pyMPI SALOME_ContainerPy.py ";
1006 //command += "SALOME_MPIContainer ";
1007 command_file << " SALOME_MPIContainer ";
1013 std::string wdir=params.workingdir.in();
1016 // a working directory is requested
1017 if(wdir == "$TEMPDIR")
1019 // a new temporary directory is requested
1020 string dir = Kernel_Utils::GetTmpDir();
1022 //command += "cd /d "+ dir +";";
1023 command_file << "cd /d " << dir << endl;
1025 //command = "cd "+ dir +";";
1026 command_file << "cd " << dir << ";";
1032 // a permanent directory is requested use it or create it
1034 //command="mkdir " + wdir;
1035 command_file << "mkdir " + wdir << endl;
1036 command_file << "cd /D " + wdir << endl;
1038 //command="mkdir -p " + wdir + " && cd " + wdir + ";";
1039 command_file << "mkdir -p " << wdir << " && cd " << wdir + ";";
1043 if (isPythonContainer(params.container_name))
1044 //command += "SALOME_ContainerPy.py ";
1045 command_file << "SALOME_ContainerPy.py ";
1047 //command += container_exe + " ";
1048 command_file << container_exe + " ";
1052 command_file << _NS->ContainerName(params);
1053 command_file << " -";
1054 AddOmninamesParams(command_file);
1055 command_file.close();
// 0x1ED is 0755 in octal: the script is made executable.
1058 chmod(_TmpFileName.c_str(), 0x1ED);
1060 command = _TmpFileName;
1062 MESSAGE("Command is file ... " << command);
1067 //=============================================================================
1069 * removes the generated temporary file in case of a remote launch.
1071 //=============================================================================
// Deletes the temporary command file tmpFileName by running a shell command
// ("del /F " or "rm "), then removes its parent directory when
// Kernel_Utils::IsEmptyDir reports it empty.
//  - tmpFileName: path of the file to delete; it is only read here.
// NOTE(review): the conditions selecting between the two command prefixes and
// between the full name and the name without its last 3 characters are on
// lines not shown in this listing. The path is passed to the shell unquoted.
1073 void SALOME_ContainerManager::RmTmpFile(std::string& tmpFileName)
1075 int lenght = tmpFileName.size();
1079 string command = "del /F ";
1081 string command = "rm ";
1084 command += tmpFileName.substr(0, lenght - 3 );
1086 command += tmpFileName;
1088 system(command.c_str());
1089 //if dir is empty - remove it
1090 string tmp_dir = Kernel_Utils::GetDirByPath( tmpFileName );
1091 if ( Kernel_Utils::IsEmptyDir( tmp_dir ) )
1094 command = "del /F " + tmp_dir;
1096 command = "rmdir " + tmp_dir;
1098 system(command.c_str());
1103 //=============================================================================
1105 * add to command all options relative to naming service.
1107 //=============================================================================
// Appends the naming-service option "ORBInitRef NameService=" to command.
// The IOR address is fetched from the naming service helper.
// NOTE(review): the line appending iorstr to command is not visible in this
// listing - presumably it follows the literal, as in the ofstream overload.
1109 void SALOME_ContainerManager::AddOmninamesParams(string& command) const
1111 CORBA::String_var iorstr = _NS->getIORaddr();
1112 command += "ORBInitRef NameService=";
1117 //=============================================================================
1119 * add to command all options relative to naming service.
1121 //=============================================================================
// Stream overload: writes "ORBInitRef NameService=" followed by the IOR
// address string to fileStream. No leading dash and no trailing newline are
// written; callers in this file emit " -" before calling.
1123 void SALOME_ContainerManager::AddOmninamesParams(ofstream& fileStream) const
// IOR address string obtained from the naming service wrapper _NS.
1125 CORBA::String_var iorstr = _NS->getIORaddr();
1126 fileStream << "ORBInitRef NameService=";
1127 fileStream << iorstr;
1130 //=============================================================================
1132 * generate a file name in /tmp directory
1134 //=============================================================================
// Produces a name for a temporary script file, starting from the value
// returned by Kernel_Utils::GetTmpFileName().
1136 string SALOME_ContainerManager::BuildTemporaryFileName() const
1138 //build more complex file name to support multiple salome session
1139 string aFileName = Kernel_Utils::GetTmpFileName();
// NOTE(review): the ".bat" suffix is presumably added only on one platform;
// the surrounding conditional and the return statement are not visible in
// this listing - confirm.
1143 aFileName += ".bat";
1149 //=============================================================================
1151 * Builds in a temporary file the script to be launched.
1153 * Used if SALOME Application ($APPLI) is not defined.
1154 * The command is built with data from CatalogResources, in which every path
1155 * used on remote computer must be defined.
1157 //=============================================================================
// Writes a launch script into a freshly named temporary file
// (_TmpFileName), makes it executable, copies it to `machine` with rcp or
// scp depending on the resource's Protocol, and builds the command used to
// run it remotely.
//
// machine : name used to look up the resource description and as the copy
//           target.
// params  : container parameters; container_name, nb_node and
//           nb_proc_per_node are read here.
// Throws SALOME_Exception("Unknown protocol") when the protocol is neither
// rsh nor ssh, and SALOME_Exception("Error of connection on remote host")
// (the guarding condition is not visible in this listing; presumably a
// non-zero status from the copy command - confirm).
// NOTE(review): several lines of this function (preprocessor branches,
// declarations, braces) are absent from this listing; comments below only
// describe what is visible.
1160 SALOME_ContainerManager::BuildTempFileToLaunchRemoteContainer
1161 (const string& machine,
1162 const Engines::MachineParameters& params) throw(SALOME_Exception)
1166 _TmpFileName = BuildTemporaryFileName();
1167 ofstream tempOutputFile;
1168 tempOutputFile.open(_TmpFileName.c_str(), ofstream::out );
1169 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine);
1170 tempOutputFile << "#! /bin/sh" << endl;
1174 tempOutputFile << "export SALOME_trace=local" << endl; // mkr : 27.11.2006 : PAL13967 - Distributed supervision graphs - Problem with "SALOME_trace"
1175 //tempOutputFile << "source " << resInfo.PreReqFilePath << endl;
// MPI launch prefix. The condition selecting this branch is not visible.
1181 tempOutputFile << "mpirun -np ";
// Process count: if exactly one of nb_node / nb_proc_per_node is zero the
// other one is used, otherwise their product. The value assigned when both
// are <= 0 is not visible in this listing.
1184 if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) )
1186 else if ( params.nb_node == 0 )
1187 nbproc = params.nb_proc_per_node;
1188 else if ( params.nb_proc_per_node == 0 )
1189 nbproc = params.nb_node;
1191 nbproc = params.nb_node * params.nb_proc_per_node;
1193 std::ostringstream o;
1195 tempOutputFile << nbproc << " ";
// Environment forwarding options; the syntax differs between the two
// compile-time MPI variants (only the WITHOPENMPI test is visible).
1197 tempOutputFile << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
1198 #elif defined(WITHOPENMPI)
// OMPI_URI_FILE is only dereferenced in the branch where it is non-null.
1199 if( getenv("OMPI_URI_FILE") == NULL )
1200 tempOutputFile << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
1202 tempOutputFile << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
1203 tempOutputFile << getenv("OMPI_URI_FILE");
// NOTE(review): KERNEL_ROOT_DIR is streamed without a null check.
1208 tempOutputFile << getenv("KERNEL_ROOT_DIR") << "/bin/salome/";
// Executable name, MPI flavour.
1212 if (isPythonContainer(params.container_name))
1213 tempOutputFile << " pyMPI SALOME_ContainerPy.py ";
1215 tempOutputFile << " SALOME_MPIContainer ";
// Executable name, non-MPI flavour.
1220 if (isPythonContainer(params.container_name))
1221 tempOutputFile << "SALOME_ContainerPy.py ";
1223 tempOutputFile << "SALOME_Container ";
// Container name followed by " -" and the naming-service option.
1226 tempOutputFile << _NS->ContainerName(params) << " -";
1227 AddOmninamesParams(tempOutputFile);
// The script runs the container in the background.
1228 tempOutputFile << " &" << endl;
1229 tempOutputFile.flush();
1230 tempOutputFile.close();
// 0x1ED is octal 0755 (rwxr-xr-x).
1232 chmod(_TmpFileName.c_str(), 0x1ED);
1235 // --- Build command
// The script is copied to the same path on the remote machine; the
// separator pieces appended between these fragments are not visible here.
1239 if (resInfo.Protocol == rsh)
1242 string commandRcp = "rcp ";
1243 commandRcp += _TmpFileName;
1245 commandRcp += machine;
1247 commandRcp += _TmpFileName;
1248 status = system(commandRcp.c_str());
1251 else if (resInfo.Protocol == ssh)
1254 string commandRcp = "scp ";
1255 commandRcp += _TmpFileName;
1257 commandRcp += machine;
1259 commandRcp += _TmpFileName;
1260 status = system(commandRcp.c_str());
1263 throw SALOME_Exception("Unknown protocol");
1266 throw SALOME_Exception("Error of connection on remote host");
// Remember the remote-access prefix before the script path is appended.
1269 _CommandForRemAccess = command;
1271 command += _TmpFileName;
1279 //=============================================================================
1280 /*! Creates a command line that the container manager uses to launch
1281 * a parallel container.
1283 //=============================================================================
// Builds the shell command line that launches a parallel container (either
// its proxy or its nodes) and records the target host names.
//
// exe_name       : base executable name; the parallel library name is
//                  appended to it, and the presence of "Proxy" in it
//                  selects the proxy code path.
// params         : container parameters (hostname, parallelLib,
//                  nb_component_nodes, ... are read here).
// vect_machine   : output; one host name is pushed per launched process.
// proxy_hostname : host name passed on the command line of node processes.
// Throws SALOME_Exception when no machine file name is obtained or when
// params.parallelLib is neither "Dummy" nor "Mpi".
// NOTE(review): many short lines of this function (braces, conditions,
// the final return) are absent from this listing; comments below describe
// only what is visible.
1285 SALOME_ContainerManager::BuildCommandToLaunchParallelContainer(const std::string& exe_name,
1286 const Engines::MachineParameters& params,
1287 SALOME_ContainerManager::actual_launch_machine_t & vect_machine,
1288 const std::string proxy_hostname)
1290 // This method knows the differences between the proxy and the nodes.
1291 // nb_component_nodes is not used in the same way if it is a proxy or
1294 //command = "gdb --args ";
1295 //command = "valgrind --tool=memcheck --log-file=val_log ";
1296 //command += real_exe_name;
1298 // Step 0 : init some variables...
// NOTE(review): CORBA::string_dup() returns a newly allocated copy that is
// only used to initialise a std::string; no matching release is visible, so
// this looks like a leak - confirm.
1299 std::string parallelLib(CORBA::string_dup(params.parallelLib));
1300 std::string real_exe_name = exe_name + parallelLib;
1301 std::string machine_file_name("");
1302 bool remote = false;
1303 bool is_a_proxy = false;
1304 std::string hostname(CORBA::string_dup(params.hostname));
// Decimal text form of the node count, used on the command lines below.
1306 std::ostringstream tmp_string;
1307 CORBA::Long nb_nodes = params.nb_component_nodes;
1308 tmp_string << nb_nodes;
1309 std::string nbproc = tmp_string.str();
// Copy of the listed fields of params; it is what gets passed to
// _NS->ContainerName() further down.
1311 Engines::MachineParameters_var rtn = new Engines::MachineParameters();
1312 rtn->container_name = params.container_name;
1313 rtn->hostname = params.hostname;
1314 rtn->OS = params.OS;
1315 rtn->mem_mb = params.mem_mb;
1316 rtn->cpu_clock = params.cpu_clock;
1317 rtn->nb_proc_per_node = params.nb_proc_per_node;
1318 rtn->nb_node = params.nb_node;
1319 rtn->isMPI = params.isMPI;
1321 // Step 1 : local or remote launch ?
1322 if (hostname != std::string(Kernel_Utils::GetHostname()) )
1324 MESSAGE("[BuildCommandToLaunchParallelContainer] remote machine case detected !");
1328 // Step 2 : proxy or nodes launch ?
1329 std::string::size_type loc_proxy = exe_name.find("Proxy");
1330 if( loc_proxy != string::npos ) {
1334 // Step 3 : Depending of the parallelLib, getting the machine file
1335 // ParallelLib Dummy has its own machine for this method
// Two getMachineFile() calls are visible; the conditions choosing between
// them and most of their arguments are not.
1340 machine_file_name = _ResManager->getMachineFile(hostname,
1346 machine_file_name = _ResManager->getMachineFile(hostname,
1347 params.nb_component_nodes,
1350 if (machine_file_name == "")
1352 INFOS("[BuildCommandToLaunchParallelContainer] Error machine_file was not generated for machine " << hostname);
1353 throw SALOME_Exception("Error machine_file was not generated");
1355 MESSAGE("[BuildCommandToLaunchParallelContainer] machine_file_name is : " << machine_file_name);
1358 // Step 4 : Log type chosen by the user
// Log mode comes from the PARALLEL_LOG environment variable; the line that
// copies get_val into log_env is not visible here.
1359 std::string log_env("");
1360 char * get_val = getenv("PARALLEL_LOG");
1363 std::string command_begin("");
1364 std::string command_end("");
1365 if(log_env == "xterm")
1367 command_begin = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH;";
1368 command_end = "\"&";
1370 else if(log_env == "xterm_debug")
1372 command_begin = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH;";
1373 command_end = "; cat \" &";
1377 // default into a file...
1378 std::string logFilename = "/tmp/" + _NS->ContainerName(params) + "_" + hostname;
1380 logFilename += "_Proxy_";
1382 logFilename += "_Node_";
// NOTE(review): getenv("USER") is passed straight to the std::string
// constructor; a null result (USER unset) would be undefined behaviour.
1383 logFilename += std::string(getenv("USER")) + ".log";
// Redirect stdout and stderr to the log file and run in the background.
1384 command_end = " > " + logFilename + " 2>&1 & ";
1387 // Step 5 : Building the command
1388 std::string command("");
1389 if (parallelLib == "Dummy")
// --- Dummy library: first visible sub-branch builds a single command
// (presumably the proxy case - the selecting condition is not visible).
1393 std::string command_remote("");
// The first line of the machine file gives the machine to launch on.
1396 std::string machine_name;
1397 std::ifstream machine_file(machine_file_name.c_str());
1398 std::getline(machine_file, machine_name);
1399 MESSAGE("[BuildCommandToLaunchParallelContainer] machine file name extracted is " << machine_name)
1401 // We want to launch a command like :
1402 // ssh user@machine distantPath/runRemote.sh hostNS portNS
1403 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine_name);
1404 if (resInfo.Protocol == rsh)
1405 command_remote = "rsh ";
1407 command_remote = "ssh ";
1408 command_remote += resInfo.UserName;
1409 command_remote += "@";
1410 command_remote += machine_name;
1411 command_remote += " ";
1412 command_remote += resInfo.AppliPath; // path relative to user@machine $HOME
1413 command_remote += "/runRemote.sh ";
1414 ASSERT(getenv("NSHOST"));
1415 command_remote += getenv("NSHOST"); // hostname of CORBA name server
1416 command_remote += " ";
1417 ASSERT(getenv("NSPORT"));
1418 command_remote += getenv("NSPORT"); // port of CORBA name server
1419 command_remote += " ";
1421 hostname = machine_name;
// Arguments: container name, parallel library, host name, node count, then
// the naming-service option.
1424 command = real_exe_name;
1425 command += " " + _NS->ContainerName(rtn);
1426 command += " " + parallelLib;
1427 command += " " + hostname;
1428 command += " " + nbproc;
1430 AddOmninamesParams(command);
1432 command = command_begin + command_remote + command + command_end;
1433 vect_machine.push_back(hostname);
// --- Dummy library: second visible sub-branch builds one command per
// node and concatenates them into `command`.
// The machine-file stream is heap-allocated here and deleted after the
// loop (see the delete below).
1437 std::ifstream * machine_file = NULL;
1439 machine_file = new std::ifstream(machine_file_name.c_str());
1440 for (int i= 0; i < nb_nodes; i++)
1442 std::string command_remote("");
// One machine-file line is consumed per node.
1445 std::string machine_name;
1446 std::getline(*machine_file, machine_name);
1447 MESSAGE("[BuildCommandToLaunchParallelContainer] machine file name extracted is " << machine_name)
1449 // We want to launch a command like :
1450 // ssh user@machine distantPath/runRemote.sh hostNS portNS
1451 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine_name);
1452 if (resInfo.Protocol == rsh)
1453 command_remote = "rsh ";
1455 command_remote = "ssh ";
1456 command_remote += resInfo.UserName;
1457 command_remote += "@";
1458 command_remote += machine_name;
1459 command_remote += " ";
1460 command_remote += resInfo.AppliPath; // path relative to user@machine $HOME
1461 command_remote += "/runRemote.sh ";
1462 ASSERT(getenv("NSHOST"));
1463 command_remote += getenv("NSHOST"); // hostname of CORBA name server
1464 command_remote += " ";
1465 ASSERT(getenv("NSPORT"));
1466 command_remote += getenv("NSPORT"); // port of CORBA name server
1467 command_remote += " ";
1469 hostname = machine_name;
// proc_number is the text taken from `tmp`; the statement that fills `tmp`
// is not visible here (presumably the loop index - confirm).
1472 std::ostringstream tmp;
1474 std::string proc_number = tmp.str();
1476 std::string command_tmp("");
1477 command_tmp += real_exe_name;
1478 command_tmp += " " + _NS->ContainerName(rtn);
1479 command_tmp += " " + parallelLib;
1480 command_tmp += " " + proxy_hostname;
1481 command_tmp += " " + proc_number;
1482 command_tmp += " -";
1483 AddOmninamesParams(command_tmp);
1485 // Replace _Node_ with _Nodex_ so that each node is distinguished
// Inserting at offset +5 puts the number just before the trailing
// underscore of "_Node_".
1487 std::string command_end_tmp = command_end;
1488 std::string::size_type loc_node = command_end_tmp.find("_Node_");
1489 if (loc_node != std::string::npos)
1490 command_end_tmp.insert(loc_node+5, proc_number);
1491 command += command_begin + command_remote + command_tmp + command_end_tmp;
1492 vect_machine.push_back(hostname);
1495 delete machine_file;
1498 else if (parallelLib == "Mpi")
1500 // Step 0: if remote we have to copy the file
1501 // to the first machine of the file
1502 std::string remote_machine("");
// NOTE(review): this stream is allocated with new and closed below, but no
// matching delete is visible in this listing - confirm.
1505 std::ifstream * machine_file = NULL;
1506 machine_file = new std::ifstream(machine_file_name.c_str());
1507 // Get first word of the line
1508 // For MPI implementation the first word is the
1510 std::getline(*machine_file, remote_machine, ' ');
1511 machine_file->close();
1512 MESSAGE("[BuildCommandToLaunchParallelContainer] machine file name extracted is " << remote_machine)
1514 // We want to launch a command like :
1515 // scp mpi_machine_file user@machine:Path
1516 std::string command_remote("");
1517 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(remote_machine);
1518 if (resInfo.Protocol == rsh)
1519 command_remote = "rcp ";
1521 command_remote = "scp ";
// The machine file is copied to the same path on the remote machine.
1523 command_remote += machine_file_name;
1524 command_remote += " ";
1525 command_remote += resInfo.UserName;
1526 command_remote += "@";
1527 command_remote += remote_machine;
1528 command_remote += ":";
1529 command_remote += machine_file_name;
1531 int status = system(command_remote.c_str());
1534 INFOS("copy of the mpi machine file failed !");
// --- Mpi library: first visible sub-branch builds the proxy command.
1541 std::string command_remote("");
1544 // We want to launch a command like :
1545 // ssh user@machine distantPath/runRemote.sh hostNS portNS
1546 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(remote_machine);
1547 if (resInfo.Protocol == rsh)
1548 command_remote = "rsh ";
1550 command_remote = "ssh ";
1551 command_remote += resInfo.UserName;
1552 command_remote += "@";
1553 command_remote += remote_machine;
1554 command_remote += " ";
1555 command_remote += resInfo.AppliPath; // path relative to user@machine $HOME
1556 command_remote += "/runRemote.sh ";
1557 ASSERT(getenv("NSHOST"));
1558 command_remote += getenv("NSHOST"); // hostname of CORBA name server
1559 command_remote += " ";
1560 ASSERT(getenv("NSPORT"));
1561 command_remote += getenv("NSPORT"); // port of CORBA name server
1562 command_remote += " ";
1564 hostname = remote_machine;
1567 // We use Dummy proxy for MPI parallel containers
1568 real_exe_name = exe_name + "Dummy";
1569 command = real_exe_name;
1570 command += " " + _NS->ContainerName(rtn);
1571 command += " Dummy";
1572 command += " " + hostname;
1573 command += " " + nbproc;
1575 AddOmninamesParams(command);
1577 command = command_begin + command_remote + command + command_end;
1578 vect_machine.push_back(hostname);
// --- Mpi library: second visible sub-branch builds the mpiexec/mpirun
// command for the nodes.
1582 std::string command_remote("");
1585 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(remote_machine);
1586 if (resInfo.Protocol == rsh)
1587 command_remote = "rsh ";
1589 command_remote = "ssh ";
1590 command_remote += resInfo.UserName;
1591 command_remote += "@";
1592 command_remote += remote_machine;
1593 command_remote += " ";
// Here the runRemote.sh wrapper is prepended to the executable name rather
// than to command_remote.
1595 std::string new_real_exe_name("");
1596 new_real_exe_name += resInfo.AppliPath; // path relative to user@machine $HOME
1597 new_real_exe_name += "/runRemote.sh ";
1598 ASSERT(getenv("NSHOST"));
1599 new_real_exe_name += getenv("NSHOST"); // hostname of CORBA name server
1600 new_real_exe_name += " ";
1601 ASSERT(getenv("NSPORT"));
1602 new_real_exe_name += getenv("NSPORT"); // port of CORBA name server
1603 new_real_exe_name += " ";
1605 real_exe_name = new_real_exe_name + real_exe_name;
1606 hostname = remote_machine;
// The launcher depends on the MPI implementation recorded for the host:
// mpiexec for lam, mpirun in the other visible branch.
1609 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(hostname);
1610 if (resInfo.mpi == lam)
1612 command = "mpiexec -ssi boot ";
1613 if (resInfo.Protocol == rsh)
1617 command += "-machinefile " + machine_file_name + " ";
1618 command += "-n " + nbproc + " ";
1619 command += real_exe_name;
1620 command += " " + _NS->ContainerName(rtn);
1621 command += " " + parallelLib;
1622 command += " " + proxy_hostname;
1624 AddOmninamesParams(command);
1628 command = "mpirun -np " + nbproc + " ";
1629 command += real_exe_name;
1630 command += " " + _NS->ContainerName(rtn);
1631 command += " " + parallelLib;
1632 command += " " + proxy_hostname;
1634 AddOmninamesParams(command);
1637 command = command_begin + command_remote + command + command_end;
// One entry per node, each holding proxy_hostname.
1638 for (int i= 0; i < nb_nodes; i++)
1639 vect_machine.push_back(proxy_hostname);
// Any other parallel library name is rejected.
1644 std::string message("Unknown parallelLib : " + parallelLib);
1645 throw SALOME_Exception(message.c_str());
1648 MESSAGE("Parallel launch is: " << command);
1652 string SALOME_ContainerManager::GetMPIZeroNode(string machine)
1657 string tmpFile = BuildTemporaryFileName();
1659 cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile;
1661 status = system(cmd.c_str());
1663 ifstream fp(tmpFile.c_str(),ios::in);