1 // Copyright (C) 2007-2008 CEA/DEN, EDF R&D, OPEN CASCADE
3 // Copyright (C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
4 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 2.1 of the License.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // Lesser General Public License for more details.
16 // You should have received a copy of the GNU Lesser General Public
17 // License along with this library; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
22 #include "SALOME_ContainerManager.hxx"
23 #include "SALOME_NamingService.hxx"
24 #include "SALOME_ModuleCatalog.hh"
25 #include "Basics_Utils.hxx"
26 #include "Basics_DirUtils.hxx"
27 #include <sys/types.h>
33 #include "Utils_CorbaException.hxx"
34 #include "Batch_Date.hxx"
36 #ifdef WITH_PACO_PARALLEL
// Initial value given to the `count` variables that bound the loops waiting
// for a freshly launched container to show up in the naming service.
// NOTE(review): the decrement/sleep inside those loops is not visible in this
// extraction, so the unit of this value cannot be confirmed from here.
40 #define TIME_OUT_TO_LAUNCH_CONT 61
// Out-of-class definitions of the static members used by GiveContainer():
// the container references collected by fillBatchLaunchedContainers() and the
// iterator pointing at the next reference to hand out.
44 vector<Engines::Container_ptr> SALOME_ContainerManager::_batchLaunchedContainers;
46 vector<Engines::Container_ptr>::iterator SALOME_ContainerManager::_batchLaunchedContainersIter;
// Name under which the constructor registers this servant in the naming
// service (and which Shutdown() destroys).
// NOTE(review): the initializer value is on a line missing from this extraction.
48 const char *SALOME_ContainerManager::_ContainerManagerNameInNS =
51 //=============================================================================
55 * Define a CORBA single-thread policy for the server, which avoids having to deal
56 * with non-thread-safe usage such as Change_Directory in the SALOME naming service
58 //=============================================================================
// Constructor: activates this servant on a dedicated child POA created with
// the SINGLE_THREAD_MODEL thread policy and registers its reference in the
// naming service under _ContainerManagerNameInNS.
//
// \param orb  ORB reference; a duplicate is kept in _orb.
// \param poa  parent POA; its POA manager is reused for the child POA.
// \param rm   resources manager (its use is not visible in these lines).
// \param ns   naming service wrapper (its use is not visible in these lines).
//
// NOTE(review): the numbers embedded at the start of each line skip values
// (61, 63-65, 69, ...), so some original lines are missing from this
// extraction -- presumably the braces, the storing of rm/ns into members and
// the sizing of `policies` before policies[0] is assigned. Confirm against
// the full file.
60 SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, SALOME_ResourcesManager *rm, SALOME_NamingService *ns)
62 MESSAGE("constructor");
66 PortableServer::POAManager_var pman = poa->the_POAManager();
67 _orb = CORBA::ORB::_duplicate(orb) ;
// Policy list carrying the single-thread policy for the child POA.
68 CORBA::PolicyList policies;
70 PortableServer::ThreadPolicy_var threadPol =
71 poa->create_thread_policy(PortableServer::SINGLE_THREAD_MODEL);
72 policies[0] = PortableServer::ThreadPolicy::_duplicate(threadPol);
// Child POA named "SThreadPOA", driven by the parent's POA manager.
74 _poa = poa->create_POA("SThreadPOA",pman,policies);
// Activate this servant on the child POA and publish the narrowed reference.
76 PortableServer::ObjectId_var id = _poa->activate_object(this);
77 CORBA::Object_var obj = _poa->id_to_reference(id);
78 Engines::ContainerManager_var refContMan =
79 Engines::ContainerManager::_narrow(obj);
81 _NS->Register(refContMan,_ContainerManagerNameInNS);
// Remember whether the APPLI environment variable is set; this flag selects
// the remote launch strategy in BuildCommandToLaunchRemoteContainer().
83 _isAppliSalomeDefined = (getenv("APPLI") != 0);
84 MESSAGE("constructor end");
87 //=============================================================================
91 //=============================================================================
// Destructor: in the visible lines it only emits a trace message.
93 SALOME_ContainerManager::~SALOME_ContainerManager()
95 MESSAGE("destructor");
98 //=============================================================================
100 * shutdown all the containers, then the ContainerManager servant
102 //=============================================================================
// Shuts down the containers via ShutdownContainers(), removes this manager's
// entry from the naming service, then deactivates this servant on its POA.
104 void SALOME_ContainerManager::Shutdown()
107 ShutdownContainers();
108 _NS->Destroy_Name(_ContainerManagerNameInNS);
// The POA needs our object id to deactivate the servant.
109 PortableServer::ObjectId_var oid = _poa->servant_to_id(this);
110 _poa->deactivate_object(oid);
113 //=============================================================================
115 * Loop on all the containers listed in naming service, ask shutdown on each
117 //=============================================================================
// Walks the "/Containers" directory of the naming service in two passes:
// first it collects the names of the entries that narrow to an
// Engines::Container, then it resolves each collected name again and handles
// the container. CORBA and unknown exceptions are logged and ignored.
//
// NOTE(review): several original lines are missing from this extraction
// (declaration of isOK, the try/brace lines, and the statements between the
// "ShutdownContainers: " trace and the catch clauses -- presumably the actual
// shutdown call). Confirm against the full file.
119 void SALOME_ContainerManager::ShutdownContainers()
121 MESSAGE("ShutdownContainers");
123 isOK = _NS->Change_Directory("/Containers");
125 vector<string> vec = _NS->list_directory_recurs();
// Pass 1: keep only the entries that are containers.
126 list<string> lstCont;
127 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++)
130 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
133 Engines::Container_var cont=Engines::Container::_narrow(obj);
134 if(!CORBA::is_nil(cont))
135 lstCont.push_back((*iter));
137 catch(const CORBA::Exception& e)
139 // ignore this entry and continue
142 MESSAGE("Container list: ");
143 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
// Pass 2: resolve each retained name again and process the container.
146 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
148 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
149 Engines::Container_var cont=Engines::Container::_narrow(obj);
150 if(!CORBA::is_nil(cont))
152 MESSAGE("ShutdownContainers: " << (*iter));
// Failures on one container must not prevent processing the others.
157 catch(CORBA::SystemException& e)
159 INFOS("CORBA::SystemException ignored : " << e);
161 catch(CORBA::Exception&)
163 INFOS("CORBA::Exception ignored.");
167 INFOS("Unknown exception ignored.");
171 MESSAGE("ShutdownContainers: no container ref for " << (*iter));
176 //=============================================================================
177 //! Find a suitable Container in a list of machines, or start one
179 * \param params Machine Parameters required for the container
180 * \param possibleComputers list of machines usable for find or start
182 //=============================================================================
// Looks for an existing container matching `params` on one of
// `possibleComputers`; when none is found, starts one with the P_FIRST
// resource policy.
//
// \param params             machine parameters required for the container.
// \param possibleComputers  machines on which to search or start.
// \return the container reference obtained from FindContainer() or
//         StartContainer().
// NOTE(review): the statement executed when `ret` is not nil (presumably
// `return ret;`) is on a line missing from this extraction.
184 Engines::Container_ptr
185 SALOME_ContainerManager::
186 FindOrStartContainer(const Engines::MachineParameters& params,
187 const Engines::MachineList& possibleComputers)
189 Engines::Container_ptr ret = FindContainer(params,possibleComputers);
190 if(!CORBA::is_nil(ret))
192 MESSAGE("Container doesn't exist try to launch it ...");
194 return StartContainer(params,possibleComputers,Engines::P_FIRST);
198 //=============================================================================
199 //! Start a suitable Container in a list of machines with constraints and a policy
201 * Constraints are given by a machine parameters struct
202 * \param params Machine Parameters required for the container
203 * \param possibleComputers list of machines usable for start
204 * \param policy policy to use (first,cycl or best)
205 * \param container_exe specific container executable (default=SALOME_Container)
207 //=============================================================================
// Starts (or, depending on params.mode, reuses) a container on one machine
// chosen among `possibleComputers`.
//
// Visible flow:
//  1. with WITH_PACO_PARALLEL and a non-empty params.parallelLib, delegate to
//     FindOrStartParallelContainer();
//  2. build the candidate machine list `lm` (restricted to machines that
//     already host a matching container when params.mode is "get");
//  3. let the resources manager pick `theMachine` from `lm`;
//  4. if a container is already registered under the computed name, either
//     return it (modes "getorstart"/"get") or shut it down;
//  5. build the launch command, redirect its output to a log file under
//     /tmp, run it in the background with system();
//  6. poll the naming service until the container is registered or the
//     counter initialised to TIME_OUT_TO_LAUNCH_CONT reaches zero;
//  7. on success, store "<user>@<host>:<logfile>" in the container.
//
// \param params             machine parameters required for the container.
// \param possibleComputers  machines usable for the start.
// \param policy             resource policy. NOTE(review): not referenced in
//                           the visible lines; the machine is selected with
//                           params.policy instead -- confirm this is intended.
// \param container_exe      container executable passed to the command
//                           builders.
// \return the container reference, or a nil reference on failure.
//
// NOTE(review): many original lines are missing from this extraction
// (braces, try/else lines, #endif, declarations of lm/theMachine/command,
// the conditions guarding the status checks, the body of the wait loop).
209 Engines::Container_ptr
210 SALOME_ContainerManager::
211 StartContainer(const Engines::MachineParameters& params,
212 const Engines::MachineList& possibleComputers,
213 Engines::ResPolicy policy,const std::string& container_exe)
215 #ifdef WITH_PACO_PARALLEL
216 std::string parallelLib(params.parallelLib);
217 if (parallelLib != "")
218 return FindOrStartParallelContainer(params, possibleComputers);
220 string containerNameInNS;
221 Engines::Container_ptr ret = Engines::Container::_nil();
223 MESSAGE("SALOME_ContainerManager::StartContainer " << possibleComputers.length());
226 // if mode is "get" keep only machines with existing containers
227 if(std::string(params.mode.in())=="get")
229 for(unsigned int i=0;i<possibleComputers.length();i++)
// NOTE(review): `cont` is a raw _ptr that is not released in the visible
// lines -- possible reference leak; a _var would release it automatically.
231 Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
234 if(!cont->_non_existent())
235 lm.push_back(string(possibleComputers[i]));
237 catch(CORBA::Exception&)
239 // CORBA::Exception ignored.
// Other modes: every proposed machine is a candidate.
245 for(unsigned int i=0;i<possibleComputers.length();i++)
246 lm.push_back(string(possibleComputers[i]));
// Ask the resources manager to choose one machine among the candidates.
252 theMachine=_ResManager->GetImpl()->Find(params.policy.in(),lm);
254 catch( const SALOME_Exception &ex )
257 return Engines::Container::_nil();
260 //If the machine name is localhost use the real name
261 if(theMachine == "localhost")
262 theMachine=Kernel_Utils::GetHostname();
264 //check if an entry exists in Naming service
265 //if params.mode == "start" or "" shutdown the existing container before launching a new one with that name
266 //if params.mode == "getorstart" or "get" use the existing container
267 containerNameInNS = _NS->BuildContainerNameForNS(params,theMachine.c_str());
269 SCRUTE(containerNameInNS);
270 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
271 if ( !CORBA::is_nil(obj) )
275 Engines::Container_var cont=Engines::Container::_narrow(obj);
276 if(!cont->_non_existent())
278 if(std::string(params.mode.in())=="getorstart"||std::string(params.mode.in())=="get")
279 return cont._retn(); /* the container exists and params.mode is getorstart or get use it*/
281 cont->Shutdown(); // shutdown the registered container if it exists
284 catch(CORBA::Exception&)
286 INFOS("CORBA::Exception ignored.");
290 //try to launch a new container
291 MESSAGE("try to launch it on " << theMachine);
295 MESSAGE("SALOME_ContainerManager::StartContainer : no possible computer");
296 return Engines::Container::_nil();
// Local launch when the chosen machine is this host, remote launch otherwise.
298 else if(theMachine==Kernel_Utils::GetHostname())
299 command = BuildCommandToLaunchLocalContainer(params,container_exe);
301 command = BuildCommandToLaunchRemoteContainer(theMachine,params,container_exe);
303 //redirect stdout and stderr in a file
// NOTE(review): getenv("USER") is concatenated without a null check; if USER
// is unset this is undefined behaviour -- confirm the variable is guaranteed.
304 string logFilename="/tmp/"+_NS->ContainerName(params)+"_"+ theMachine +"_"+getenv( "USER" )+".log" ;
305 command += " > " + logFilename + " 2>&1 &";
307 // launch container with a system call
308 int status=system(command.c_str());
// Status values -1 and 217 are treated as launch failures; the temporary
// command file is removed before returning nil.
311 MESSAGE("SALOME_ContainerManager::StartContainer rsh failed (system command status -1)");
312 RmTmpFile(_TmpFileName); // command file can be removed here
313 return Engines::Container::_nil();
315 else if (status == 217){
316 MESSAGE("SALOME_ContainerManager::StartContainer rsh failed (system command status 217)");
317 RmTmpFile(_TmpFileName); // command file can be removed here
318 return Engines::Container::_nil();
// Poll the naming service until the container registers itself or `count`
// reaches zero (the decrement is on lines missing from this extraction).
321 int count=TIME_OUT_TO_LAUNCH_CONT;
322 MESSAGE("count = "<<count);
323 while ( CORBA::is_nil(ret) && count ){
331 MESSAGE( count << ". Waiting for container on " << theMachine);
333 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
334 ret=Engines::Container::_narrow(obj);
337 if ( CORBA::is_nil(ret) )
339 MESSAGE("SALOME_ContainerManager::StartContainer rsh failed");
// Turn the log path into "<user>@<host>:<path>" and record it in the container.
343 logFilename=":"+logFilename;
344 logFilename="@"+Kernel_Utils::GetHostname()+logFilename;
345 logFilename=getenv( "USER" )+logFilename;
346 ret->logfilename(logFilename.c_str());
349 RmTmpFile(_TmpFileName); // command file can be removed here
354 //=============================================================================
355 //! Start a suitable Container for a list of components with constraints and a policy
357 * \param params Machine Parameters required for the container
358 * \param policy policy to use (first,cycl or best)
359 * \param componentList list of component to be loaded on this container
361 //=============================================================================
// Starts a container able to host the components of `componentList`.
//
// The candidate machines come from the resources manager
// (GetFittingResources). The module catalog registered under
// "/Kernel/ModulCatalog" is then queried for each component: the component's
// implementation name is stored in `container_exe`, and components whose
// implementation type is CEXE receive special handling (partly on lines
// missing from this extraction). An error is reported when two CEXE
// components are requested for the same container.
//
// \param params         machine parameters required for the container.
// \param policy         resource policy forwarded to the other overload.
// \param componentList  components to be loaded in the container.
// \return the reference returned by the machine-list overload of
//         StartContainer(), or nil when the catalog cannot be reached, a
//         CEXE conflict is detected or an exception is caught.
//
// NOTE(review): braces, try/else lines and the condition choosing between
// the two final return statements are missing from this extraction.
363 Engines::Container_ptr
364 SALOME_ContainerManager::
365 StartContainer(const Engines::MachineParameters& params,
366 Engines::ResPolicy policy,
367 const Engines::CompoList& componentList)
369 Engines::MachineList_var possibleComputers = _ResManager->GetFittingResources(params,componentList);
371 // Look into ModulCatalog if a specific container must be launched
372 CORBA::String_var container_exe;
376 CORBA::Object_var obj = _NS->Resolve("/Kernel/ModulCatalog");
377 SALOME_ModuleCatalog::ModuleCatalog_var Catalog = SALOME_ModuleCatalog::ModuleCatalog::_narrow(obj) ;
378 if (CORBA::is_nil (Catalog))
379 return Engines::Container::_nil();
380 // Loop through component list
381 for(unsigned int i=0;i<componentList.length();i++)
383 const char* compoi = componentList[i];
384 SALOME_ModuleCatalog::Acomponent_var compoInfo = Catalog->GetComponent(compoi);
385 if (CORBA::is_nil (compoInfo))
389 SALOME_ModuleCatalog::ImplType impl=compoInfo->implementation_type();
390 container_exe=compoInfo->implementation_name();
391 if(impl==SALOME_ModuleCatalog::CEXE)
395 INFOS("ContainerManager Error: you can't have 2 CEXE component in the same container" );
396 return Engines::Container::_nil();
402 catch (ServiceUnreachable&)
404 INFOS("Caught exception: Naming Service Unreachable");
405 return Engines::Container::_nil();
409 INFOS("Caught unknown exception.");
410 return Engines::Container::_nil();
// With a specific executable: pass it on; otherwise rely on the default
// value of the container_exe parameter of the other overload.
414 return StartContainer(params,possibleComputers,policy,container_exe.in());
416 return StartContainer(params,possibleComputers,policy);
419 #ifdef WITH_PACO_PARALLEL
420 //=============================================================================
422 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
423 * \param params Machine Parameters required for the container
424 * \param possibleComputers list of machines usable for find or start
426 * \return CORBA container reference.
428 //=============================================================================
// PaCO++ build (WITH_PACO_PARALLEL): finds a suitable parallel container or
// starts one on the local host.
//
// Visible flow: try FindContainer(); if nothing is found, pick a machine
// with _ResManager->FindFirst(); when that machine is the local host, launch
// the proxy ("SALOME_ParallelContainerProxy"), then the nodes
// ("SALOME_ParallelContainerNode"), then resolve every node in the naming
// service and narrow it to PaCO::InterfaceParallel. Machines other than the
// local host are only reported as unsupported.
//
// \param params_const       machine parameters; copied into a local `params`
//                           because the hostname field is overwritten.
// \param possibleComputers  machines usable for find or start.
// \return CORBA container reference (nil in the failure paths visible here).
//
// NOTE(review): many original lines are missing from this extraction
// (declarations of `command`/`buffer`, try/else/#ifdef lines, the body of the
// retry loop, the node deployment calls and the final return).
429 Engines::Container_ptr
430 SALOME_ContainerManager::
431 FindOrStartParallelContainer(const Engines::MachineParameters& params_const,
432 const Engines::MachineList& possibleComputers)
434 CORBA::Object_var obj;
435 PaCO::InterfaceManager_var proxy;
436 Engines::Container_ptr ret = Engines::Container::_nil();
437 Engines::MachineParameters params(params_const);
439 // Step 1 : Try to find a suitable container
440 // Currently not as good as it could be since
441 // we would have to verify the number of nodes of the container
442 // if the user specifies it.
443 ret = FindContainer(params, possibleComputers);
445 if(CORBA::is_nil(ret)) {
446 // Step 2 : Starting a new parallel container
447 INFOS("[FindOrStartParallelContainer] Starting a parallel container");
449 // Step 2.1 : Choose a computer
450 string theMachine = _ResManager->FindFirst(possibleComputers);
451 if(theMachine == "") {
452 INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
453 INFOS("[FindOrStartParallelContainer] No possible computer found");
454 INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
457 INFOS("[FindOrStartParallelContainer] on machine : " << theMachine);
459 if(theMachine == Kernel_Utils::GetHostname()) {
460 // Step 3 : starting parallel container proxy
461 params.hostname = CORBA::string_dup(theMachine.c_str());
462 Engines::MachineParameters params_proxy(params);
464 command = BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerProxy", params_proxy, "xterm");
466 catch(const SALOME_Exception & ex){
468 return Engines::Container::_nil();
470 // LaunchParallelContainer uses this value to know if it launches the proxy or the nodes
471 params_proxy.nb_component_nodes = 0;
472 obj = LaunchParallelContainer(command, params_proxy, _NS->ContainerName(params));
473 ret = Engines::Container::_narrow(obj);
474 proxy = PaCO::InterfaceManager::_narrow(obj);
476 // Step 4 : starting parallel container nodes
477 command = BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerNode", params, "xterm");
478 string name = _NS->ContainerName(params) + "Node";
// NOTE(review): the object reference returned here is discarded without
// being released -- possible reference leak; confirm.
479 LaunchParallelContainer(command, params, name);
480 // Step 5 : connecting nodes and the proxy to actually create a parallel container
482 for (int i = 0; i < params.nb_component_nodes; i++) {
// Node names are `name` followed by the node index (written with at most 4
// characters plus the terminator; the two calls are platform alternatives).
486 snprintf(buffer,5,"%d",i);
488 _snprintf(buffer,5,"%d",i);
490 string name_cont = name + string(buffer);
// NOTE(review): the buffer returned by CORBA::string_dup is copied into the
// std::string and never freed -- looks like a leak; confirm.
492 string theNodeMachine(CORBA::string_dup(params.hostname));
493 string containerNameInNS = _NS->BuildContainerNameForNS(name_cont.c_str(),theNodeMachine.c_str());
494 int count = TIME_OUT_TO_LAUNCH_CONT;
495 obj = _NS->Resolve(containerNameInNS.c_str());
// Retry the lookup while the node is not registered and `count` is non-zero.
496 while (CORBA::is_nil(obj) && count) {
497 INFOS("[FindOrStartParallelContainer] CONNECTION FAILED !!!!!!!!!!!!!!!!!!!!!!!!");
504 obj = _NS->Resolve(containerNameInNS.c_str());
507 PaCO::InterfaceParallel_var node = PaCO::InterfaceParallel::_narrow(obj);
508 MESSAGE("[FindOrStartParallelContainer] Deploying node : " << name);
// Every failure while deploying a node is logged and swallowed.
513 catch(CORBA::SystemException& e)
515 INFOS("Caught CORBA::SystemException. : " << e);
517 catch(PortableServer::POA::ServantAlreadyActive&)
519 INFOS("Caught CORBA::ServantAlreadyActiveException");
521 catch(CORBA::Exception&)
523 INFOS("Caught CORBA::Exception.");
525 catch(std::exception& exc)
527 INFOS("Caught std::exception - "<<exc.what());
531 INFOS("Caught unknown exception.");
533 INFOS("[FindOrStartParallelContainer] node " << name << " deployed");
536 INFOS("[FindOrStartParallelContainer] Currently parallel containers are launched only on the local host");
543 //=============================================================================
545 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
546 * \param params Machine Parameters required for the container
547 * \param possibleComputers list of machines usable for find or start
549 * \return CORBA container reference.
551 //=============================================================================
// Variant used when the parallel extension is not compiled in: it reports
// that the feature is disabled. `ret` is initialised to a nil reference.
//
// \param params             unused in the visible lines.
// \param possibleComputers  unused in the visible lines.
// NOTE(review): the return statement (presumably `return ret;`) and the
// braces are on lines missing from this extraction.
552 Engines::Container_ptr
553 SALOME_ContainerManager::
554 FindOrStartParallelContainer(const Engines::MachineParameters& params,
555 const Engines::MachineList& possibleComputers)
557 Engines::Container_ptr ret = Engines::Container::_nil();
558 INFOS("[FindOrStartParallelContainer] is disabled !");
559 INFOS("[FindOrStartParallelContainer] recompile SALOME Kernel to enable parallel extension");
564 //=============================================================================
565 //! Give a suitable Container for a list of components with constraints and a policy
567 * \param params Machine Parameters required for the container
568 * \param policy policy to use (first,cycl or best)
569 * \param componentList list of component to be loaded on this container
571 //=============================================================================
// Returns a container for the given components.
//
// When the SALOME_BATCH environment variable equals "1", containers already
// registered in the naming service are handed out one after the other,
// wrapping around at the end of the list (a duplicate of the stored
// reference is produced for each call). Otherwise the call is forwarded to
// StartContainer(params, policy, componentList).
//
// \param params         machine parameters required for the container.
// \param policy         resource policy forwarded to StartContainer().
// \param componentList  components to be loaded in the container.
//
// NOTE(review): lines are missing from this extraction (braces, the
// `return rtn;`, and possibly a null check on `valenv` before strcmp).
// NOTE(review): if fillBatchLaunchedContainers() finds no container the
// vector stays empty and `*_batchLaunchedContainersIter` dereferences end()
// -- confirm whether an emptiness check exists in the full file.
573 Engines::Container_ptr
574 SALOME_ContainerManager::
575 GiveContainer(const Engines::MachineParameters& params,
576 Engines::ResPolicy policy,
577 const Engines::CompoList& componentList)
579 char *valenv=getenv("SALOME_BATCH");
581 if (strcmp(valenv,"1")==0)
// Lazily build the list of batch-launched containers on first use.
583 if(_batchLaunchedContainers.empty())
584 fillBatchLaunchedContainers();
// Wrap around once every container has been handed out.
586 if (_batchLaunchedContainersIter == _batchLaunchedContainers.end())
587 _batchLaunchedContainersIter = _batchLaunchedContainers.begin();
589 Engines::Container_ptr rtn = Engines::Container::_duplicate(*_batchLaunchedContainersIter);
590 _batchLaunchedContainersIter++;
593 return StartContainer(params,policy,componentList);
596 //=============================================================================
600 //=============================================================================
// Looks up, in the naming service, the container matching `params` on the
// machine `theMachine`.
//
// \param params      machine parameters used to build the naming-service name.
// \param theMachine  host name used to build the naming-service name.
// \return the narrowed container reference, or a nil reference when the
//         resolved object no longer exists or a CORBA exception is caught.
// NOTE(review): the try/brace lines are missing from this extraction.
602 Engines::Container_ptr
603 SALOME_ContainerManager::
604 FindContainer(const Engines::MachineParameters& params,
605 const char *theMachine)
607 string containerNameInNS(_NS->BuildContainerNameForNS(params,theMachine));
608 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
// A stale entry (object gone) is reported as "not found".
611 if(obj->_non_existent())
612 return Engines::Container::_nil();
614 return Engines::Container::_narrow(obj);
616 catch(const CORBA::Exception& e)
618 return Engines::Container::_nil();
622 //=============================================================================
626 //=============================================================================
// Tries each machine of `possibleComputers` in order with the single-machine
// FindContainer() overload.
//
// \param params             machine parameters required for the container.
// \param possibleComputers  machines to search, in order.
// \return a nil reference when no machine yields a container.
// NOTE(review): the statement executed when `cont` is not nil (presumably
// `return cont;`) is on a line missing from this extraction.
628 Engines::Container_ptr
629 SALOME_ContainerManager::
630 FindContainer(const Engines::MachineParameters& params,
631 const Engines::MachineList& possibleComputers)
633 MESSAGE("FindContainer "<<possibleComputers.length());
634 for(unsigned int i=0;i<possibleComputers.length();i++)
636 MESSAGE("FindContainer possible " << possibleComputers[i]);
637 Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
638 if( !CORBA::is_nil(cont) )
641 MESSAGE("FindContainer: not found");
642 return Engines::Container::_nil();
645 //=============================================================================
646 /*! This method launches the parallel container.
647 * It may later be moved into the resources manager.
649 * \param command to launch
650 * \param container's parameters
651 * \param name of the container
653 * \return CORBA container reference
655 //=============================================================================
// Runs `command` with system() and waits for the launched parallel container
// objects to appear in the naming service.
//
// params.nb_component_nodes selects the mode:
//  - 0: the proxy is being launched; wait for the single entry built from
//    `name`;
//  - otherwise: the nodes are being launched; wait in turn for each entry
//    built from `name` followed by the node index.
// Each wait loop runs while the object is nil and `count` (initialised to
// TIME_OUT_TO_LAUNCH_CONT) is non-zero.
//
// \param command  shell command to execute.
// \param params   container parameters (hostname, nb_component_nodes).
// \param name     container name used to build the naming-service entries.
// \return CORBA object reference (the final return is not visible here).
//
// NOTE(review): lines are missing from this extraction (return type, braces,
// else lines, the conditions guarding the "-1" status messages, the
// declaration of `buffer`, the #ifdef selecting snprintf/_snprintf, the loop
// bodies and the final return).
657 SALOME_ContainerManager::LaunchParallelContainer(const std::string& command,
658 const Engines::MachineParameters& params,
659 const std::string& name)
661 CORBA::Object_ptr obj = CORBA::Object::_nil();
662 string containerNameInNS;
663 MESSAGE("[LaunchParallelContainer] : command to launch...");
665 if (params.nb_component_nodes == 0) {
666 INFOS("[LaunchParallelContainer] launching the proxy of the parallel container");
667 int status = system(command.c_str());
669 INFOS("[LaunchParallelContainer] failed : system command status -1");
671 else if (status == 217) {
672 INFOS("[LaunchParallelContainer] failed : system command status 217");
675 int count = TIME_OUT_TO_LAUNCH_CONT;
// NOTE(review): the buffer returned by CORBA::string_dup is copied into the
// std::string and never freed -- looks like a leak; confirm.
676 string theMachine(CORBA::string_dup(params.hostname));
677 containerNameInNS = _NS->BuildContainerNameForNS((char*) name.c_str(),theMachine.c_str());
679 INFOS("[LaunchParallelContainer] Waiting for Parallel Container proxy on " << theMachine);
680 while (CORBA::is_nil(obj) && count) {
687 obj = _NS->Resolve(containerNameInNS.c_str());
691 INFOS("[LaunchParallelContainer] launching the nodes of the parallel container");
692 int status = system(command.c_str());
694 INFOS("[LaunchParallelContainer] failed : system command status -1");
696 else if (status == 217) {
697 INFOS("[LaunchParallelContainer] failed : system command status 217");
699 // We are waiting all the nodes
700 for (int i = 0; i < params.nb_component_nodes; i++) {
// Reset the reference and the counter for each node.
701 obj = CORBA::Object::_nil();
702 int count = TIME_OUT_TO_LAUNCH_CONT;
707 snprintf(buffer,5,"%d",i);
709 _snprintf(buffer,5,"%d",i);
712 string name_cont = name + string(buffer);
714 // I don't like this...
715 string theMachine(CORBA::string_dup(params.hostname));
716 containerNameInNS = _NS->BuildContainerNameForNS((char*) name_cont.c_str(),theMachine.c_str());
717 cerr << "[LaunchContainer] Waiting for Parllel Container node " << containerNameInNS << " on " << theMachine << endl;
718 while (CORBA::is_nil(obj) && count) {
725 obj = _NS->Resolve(containerNameInNS.c_str());
730 if ( CORBA::is_nil(obj) ) {
731 INFOS("[LaunchParallelContainer] failed");
// Rebuilds the static list of container references used by GiveContainer()
// in batch mode: every entry found under "/Containers" in the naming service
// that narrows to an Engines::Container is appended, and the hand-out
// iterator is reset to the beginning of the list.
// The narrowed reference is stored as a raw _ptr, so the vector holds it.
// NOTE(review): clear() drops the previously stored _ptr values without
// releasing them in the visible lines -- confirm whether that matters here.
736 void SALOME_ContainerManager::fillBatchLaunchedContainers()
738 _batchLaunchedContainers.clear();
739 _NS->Change_Directory("/Containers");
740 vector<string> vec = _NS->list_directory_recurs();
741 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++){
742 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
743 Engines::Container_ptr cont=Engines::Container::_narrow(obj);
744 if(!CORBA::is_nil(cont)){
745 _batchLaunchedContainers.push_back(cont);
748 _batchLaunchedContainersIter=_batchLaunchedContainers.begin();
751 //=============================================================================
753 * This is no longer valid (C++ containers are also Python containers)
755 //=============================================================================
//! Tell whether a container name designates a Python container.
/*!
 *  A name is treated as a Python container name when it ends with the
 *  two-character suffix "Py".
 *
 *  \param ContainerName NUL-terminated container name. A null pointer is
 *         accepted and reported as "not a Python container".
 *  \return true when the name ends with "Py"; false otherwise, including
 *          for names shorter than two characters.
 */
bool isPythonContainer(const char* ContainerName)
{
  if (ContainerName == 0)
    return false;

  const size_t len = strlen(ContainerName);

  // A name shorter than the suffix cannot match; bailing out here also keeps
  // the pointer arithmetic below inside the string.
  if (len < 2)
    return false;

  return strcmp(ContainerName + len - 2, "Py") == 0;
}
769 //=============================================================================
771 * Builds the script to be launched
773 * If SALOME Application not defined ($APPLI),
774 * see BuildTempFileToLaunchRemoteContainer()
776 * Else rely on distant configuration. Command is under the form (example):
777 * ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \
778 * SALOME_Container containerName &"
780 * - where user is omitted if not specified in CatalogResources,
781 * - where distant path is always relative to user@machine $HOME, and
782 * equal to $APPLI if not specified in CatalogResources,
783 * - where hostNS is the hostname of CORBA naming server (set by scripts to
784 * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh)
785 * - where portNS is the port used by CORBA naming server (set by scripts to
786 * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh)
787 * - where workingdir is the requested working directory for the container.
788 * If WORKINGDIR (and workingdir) is not present the working dir will be $HOME
790 //=============================================================================
// Builds the shell command that launches a container on a remote machine.
//
// When no SALOME application is defined (_isAppliSalomeDefined is false) the
// work is delegated to BuildTempFileToLaunchRemoteContainer(). Otherwise the
// command is assembled from the resource description of `machine`: a remote
// shell selected by resInfo.Protocol (rsh or ssh; anything else throws
// SALOME_Exception("Unknown protocol")), the optional user name, the
// application path (resInfo.AppliPath, else $APPLI) followed by
// "/runRemote.sh ", the NSHOST and NSPORT values, an optional
// " WORKINGDIR " section, an optional mpirun prefix, the container
// executable, the container name and the naming-service parameters.
//
// \param machine        target machine, looked up in the resources manager.
// \param params         container parameters (nb_node, nb_proc_per_node,
//                       workingdir, ...).
// \param container_exe  executable used for the non-MPI branch.
// \return the command string (the return statement is not visible here).
//
// NOTE(review): many lines are missing from this extraction (return type,
// braces, else/#ifdef lines, the literals appended for rsh/ssh and the
// separators, the condition selecting the MPI branch, the final return).
793 SALOME_ContainerManager::BuildCommandToLaunchRemoteContainer
794 (const string& machine,
795 const Engines::MachineParameters& params, const std::string& container_exe)
800 if ( ! _isAppliSalomeDefined )
801 command = BuildTempFileToLaunchRemoteContainer(machine, params);
805 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine);
// Number of processes: the non-zero one of nb_node / nb_proc_per_node, or
// their product when both are set (the value used when both are <= 0 is on a
// line missing from this extraction).
809 if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) )
811 else if ( params.nb_node == 0 )
812 nbproc = params.nb_proc_per_node;
813 else if ( params.nb_proc_per_node == 0 )
814 nbproc = params.nb_node;
816 nbproc = params.nb_node * params.nb_proc_per_node;
819 // "ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \
820 // SALOME_Container containerName &"
822 if (resInfo.Protocol == rsh)
824 else if (resInfo.Protocol == ssh)
827 throw SALOME_Exception("Unknown protocol");
829 if (resInfo.UserName != "")
831 command += resInfo.UserName;
838 if (resInfo.AppliPath != "")
839 command += resInfo.AppliPath; // path relative to user@machine $HOME
842 ASSERT(getenv("APPLI"));
843 command += getenv("APPLI"); // path relative to user@machine $HOME
846 command += "/runRemote.sh ";
848 ASSERT(getenv("NSHOST"));
849 command += getenv("NSHOST"); // hostname of CORBA name server
852 ASSERT(getenv("NSPORT"));
853 command += getenv("NSPORT"); // port of CORBA name server
855 std::string wdir=params.workingdir.in();
858 command += " WORKINGDIR ";
860 if(wdir == "$TEMPDIR")
862 command += wdir; // requested working directory
868 command += " mpirun -np ";
869 std::ostringstream o;
// Environment variables exported to the MPI processes; the option syntax
// differs between the two MPI implementations handled by the preprocessor.
873 command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
874 #elif defined(WITHOPENMPI)
875 if( getenv("OMPI_URI_FILE") == NULL )
876 command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
878 command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
879 command += getenv("OMPI_URI_FILE");
882 command += " SALOME_MPIContainer ";
885 command += " " +container_exe+ " ";
887 command += _NS->ContainerName(params);
889 AddOmninamesParams(command);
891 MESSAGE("command =" << command);
897 //=============================================================================
899 * builds the command to be launched.
901 //=============================================================================
// Writes the command that launches a container on the local host into a
// temporary script file and returns the path of that file as the command.
//
// The script is written to _TmpFileName (from BuildTemporaryFileName()). In
// the MPI branch it contains an "mpirun -np <nbproc>" line with the exported
// environment and either " pyMPI SALOME_ContainerPy.py " or
// " SALOME_MPIContainer ". In the other branch it optionally changes to the
// requested working directory ("$TEMPDIR" requests a new temporary
// directory, any other value is created if needed) and then starts either
// "SALOME_ContainerPy.py " or `container_exe`. The container name and the
// naming-service parameters are appended last. The file is then made
// executable.
//
// \param params         container parameters (nb_node, nb_proc_per_node,
//                       workingdir, container_name, ...).
// \param container_exe  executable used for non-Python, non-MPI containers.
// \return the script path (the return statement is not visible here).
//
// NOTE(review): many lines are missing from this extraction (return type,
// braces, else and #ifdef/#else/#endif lines separating the Windows and Unix
// variants, the condition selecting the MPI branch, the final return).
904 SALOME_ContainerManager::BuildCommandToLaunchLocalContainer
905 (const Engines::MachineParameters& params, const std::string& container_exe)
907 _TmpFileName = BuildTemporaryFileName();
911 ofstream command_file( _TmpFileName.c_str() );
915 //command = "mpirun -np ";
916 command_file << "mpirun -np ";
// Number of processes: the non-zero one of nb_node / nb_proc_per_node, or
// their product when both are set.
918 if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) )
920 else if ( params.nb_node == 0 )
921 nbproc = params.nb_proc_per_node;
922 else if ( params.nb_proc_per_node == 0 )
923 nbproc = params.nb_node;
925 nbproc = params.nb_node * params.nb_proc_per_node;
927 //std::ostringstream o;
929 //o << nbproc << " ";
930 command_file << nbproc << " ";
932 //command += o.str();
934 //command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
935 command_file << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
936 #elif defined(WITHOPENMPI)
937 //command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace ";
938 if( getenv("OMPI_URI_FILE") == NULL )
939 command_file << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
942 command_file << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
943 command_file << getenv("OMPI_URI_FILE");
947 if (isPythonContainer(params.container_name))
948 //command += "pyMPI SALOME_ContainerPy.py ";
949 command_file << " pyMPI SALOME_ContainerPy.py ";
951 //command += "SALOME_MPIContainer ";
952 command_file << " SALOME_MPIContainer ";
958 std::string wdir=params.workingdir.in();
961 // a working directory is requested
962 if(wdir == "$TEMPDIR")
964 // a new temporary directory is requested
965 string dir = Kernel_Utils::GetTmpDir();
967 //command += "cd /d "+ dir +";";
968 command_file << "cd /d " << dir << endl;
970 //command = "cd "+ dir +";";
971 command_file << "cd " << dir << ";";
977 // a permanent directory is requested use it or create it
979 //command="mkdir " + wdir;
980 command_file << "mkdir " + wdir << endl;
981 command_file << "cd /D " + wdir << endl;
983 //command="mkdir -p " + wdir + " && cd " + wdir + ";";
984 command_file << "mkdir -p " << wdir << " && cd " << wdir + ";";
988 if (isPythonContainer(params.container_name))
989 //command += "SALOME_ContainerPy.py ";
990 command_file << "SALOME_ContainerPy.py ";
992 //command += container_exe + " ";
993 command_file << container_exe + " ";
997 command_file << _NS->ContainerName(params);
998 command_file << " -";
999 AddOmninamesParams(command_file);
1000 command_file.close();
// 0x1ED == 0755: rwx for the owner, r-x for group and others.
1003 chmod(_TmpFileName.c_str(), 0x1ED);
// The command handed back to the caller is the script file itself.
1005 command = _TmpFileName;
1007 MESSAGE("Command is file ... " << command);
1012 //=============================================================================
1014 * removes the generated temporary file in case of a remote launch.
1016 //=============================================================================
// Removes the temporary command file through a shell command, then removes
// its directory when that directory has become empty.
//
// \param tmpFileName  path of the temporary file to delete.
//
// The two `string command = ...` lines and the two `command += ...` lines
// are alternatives ("del /F " vs "rm ", and the name without its last three
// characters vs the full name); likewise "del /F " vs "rmdir " for the
// directory.
// NOTE(review): the preprocessor/if lines selecting between the alternatives
// and the braces are missing from this extraction.
// NOTE(review): the path is passed unquoted to system(); a name containing
// spaces or shell metacharacters would not be handled -- confirm how the
// temporary names are generated.
1018 void SALOME_ContainerManager::RmTmpFile(std::string& tmpFileName)
1020 int lenght = tmpFileName.size();
1024 string command = "del /F ";
1026 string command = "rm ";
1029 command += tmpFileName.substr(0, lenght - 3 );
1031 command += tmpFileName;
1033 system(command.c_str());
1034 //if dir is empty - remove it
1035 string tmp_dir = Kernel_Utils::GetDirByPath( tmpFileName );
1036 if ( Kernel_Utils::IsEmptyDir( tmp_dir ) )
1039 command = "del /F " + tmp_dir;
1041 command = "rmdir " + tmp_dir;
1043 system(command.c_str());
1048 //=============================================================================
1050 * add to command all options relative to naming service.
1052 //=============================================================================
// Appends the naming-service option to a command string: the text
// "ORBInitRef NameService=" built around the IOR address obtained from the
// naming service wrapper.
//
// \param command  command string extended in place.
// NOTE(review): the line appending `iorstr` itself and the braces are
// missing from this extraction. The leading "-" of the option is expected to
// be supplied by the caller (the ofstream overload's callers write " -"
// first) -- confirm for the string callers.
1054 void SALOME_ContainerManager::AddOmninamesParams(string& command) const
1056 CORBA::String_var iorstr = _NS->getIORaddr();
1057 command += "ORBInitRef NameService=";
1062 //=============================================================================
1064 * add to command all options relative to naming service.
1066 //=============================================================================
// Stream variant: writes "ORBInitRef NameService=" followed by the IOR
// address obtained from the naming service wrapper to `fileStream`.
//
// \param fileStream  output file stream receiving the option text.
1068 void SALOME_ContainerManager::AddOmninamesParams(ofstream& fileStream) const
1070 CORBA::String_var iorstr = _NS->getIORaddr();
1071 fileStream << "ORBInitRef NameService=";
1072 fileStream << iorstr;
1075 //=============================================================================
1077 * generate a file name in /tmp directory
1079 //=============================================================================
// Returns a temporary file name obtained from Kernel_Utils::GetTmpFileName().
// A ".bat" extension is appended in one build variant.
// NOTE(review): the preprocessor lines guarding the ".bat" suffix (and any
// alternative suffix), the braces and the return statement are missing from
// this extraction.
1081 string SALOME_ContainerManager::BuildTemporaryFileName() const
1083 //build more complex file name to support multiple salome session
1084 string aFileName = Kernel_Utils::GetTmpFileName();
1088 aFileName += ".bat";
1094 //=============================================================================
1096 * Builds in a temporary file the script to be launched.
1098 * Used if SALOME Application ($APPLI) is not defined.
1099 * The command is built with data from CatalogResources, in which every path
1100 * used on remote computer must be defined.
1102 //=============================================================================
// Writes a shell script that starts a container, copies that script to the
// target machine with rcp or scp, and assembles the command used to run it.
//
// machine: name of the target host; used to look up its resource description
//          and added to the copy command.
// params:  container request; nb_node, nb_proc_per_node and container_name
//          are read in the visible lines.
//
// Throws SALOME_Exception with "Unknown protocol" when the resource protocol
// is neither rsh nor ssh, and with "Error of connection on remote host"
// (the guarding condition is not visible; presumably a failed copy).
//
// Side effects visible here: sets _TmpFileName and _CommandForRemAccess,
// creates and chmods the script file, runs the copy through system().
//
// NOTE(review): the return type, braces, several else branches and the
// #if/#else/#endif lines of this definition are not visible in this listing;
// comments below only describe the statements that are.
1105 SALOME_ContainerManager::BuildTempFileToLaunchRemoteContainer
1106 (const string& machine,
1107 const Engines::MachineParameters& params) throw(SALOME_Exception)
// The script path is stored in a member, not a local.
1111 _TmpFileName = BuildTemporaryFileName();
1112 ofstream tempOutputFile;
// NOTE(review): the stream is not checked after open(); a failure to create
// the file would go unnoticed in the visible lines.
1113 tempOutputFile.open(_TmpFileName.c_str(), ofstream::out );
1114 const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine);
1115 tempOutputFile << "#! /bin/sh" << endl;
1119 tempOutputFile << "export SALOME_trace=local" << endl; // mkr : 27.11.2006 : PAL13967 - Distributed supervision graphs - Problem with "SALOME_trace"
1120 //tempOutputFile << "source " << resInfo.PreReqFilePath << endl;
// MPI launch prefix; the condition selecting the MPI case is not visible
// (presumably params.isMPI - TODO confirm).
1126 tempOutputFile << "mpirun -np ";
// Process count selection:
//   both counts <= 0        -> assignment not visible (presumably 1)
//   nb_node == 0            -> nb_proc_per_node
//   nb_proc_per_node == 0   -> nb_node
//   remaining case          -> nb_node * nb_proc_per_node (its `else` is not visible)
1129 if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) )
1131 else if ( params.nb_node == 0 )
1132 nbproc = params.nb_proc_per_node;
1133 else if ( params.nb_proc_per_node == 0 )
1134 nbproc = params.nb_node;
1136 nbproc = params.nb_node * params.nb_proc_per_node;
// NOTE(review): `o` does not appear to be used in the visible lines.
1138 std::ostringstream o;
1140 tempOutputFile << nbproc << " ";
// Environment variables exported to the MPI processes. The two spellings
// below belong to different preprocessor branches; the opening #if of the
// first one is not visible.
1142 tempOutputFile << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
1143 #elif defined(WITHOPENMPI)
// When OMPI_URI_FILE is set, its value is appended after "-ompi-server file:".
1144 if( getenv("OMPI_URI_FILE") == NULL )
1145 tempOutputFile << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
1147 tempOutputFile << "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
1148 tempOutputFile << getenv("OMPI_URI_FILE");
// NOTE(review): getenv("KERNEL_ROOT_DIR") is streamed without a null check;
// if the variable is unset a null char* is passed to operator<<.
1153 tempOutputFile << getenv("KERNEL_ROOT_DIR") << "/bin/salome/";
// Executable selection. Two pairs are visible: an MPI pair
// (pyMPI SALOME_ContainerPy.py / SALOME_MPIContainer) and a plain pair
// (SALOME_ContainerPy.py / SALOME_Container); within each pair the Python
// variant is chosen when isPythonContainer(container_name) is true. The
// condition choosing between the pairs is not visible.
1157 if (isPythonContainer(params.container_name))
1158 tempOutputFile << " pyMPI SALOME_ContainerPy.py ";
1160 tempOutputFile << " SALOME_MPIContainer ";
1165 if (isPythonContainer(params.container_name))
1166 tempOutputFile << "SALOME_ContainerPy.py ";
1168 tempOutputFile << "SALOME_Container ";
// Container name, then " -" immediately followed by the naming-service
// option, so the script line ends with "-ORBInitRef NameService=<ior> &".
1171 tempOutputFile << _NS->ContainerName(params) << " -";
1172 AddOmninamesParams(tempOutputFile);
1173 tempOutputFile << " &" << endl;
1174 tempOutputFile.flush();
1175 tempOutputFile.close();
// 0x1ED == 0755 octal: rwx for the owner, r-x for group and others.
1177 chmod(_TmpFileName.c_str(), 0x1ED);
1180 // --- Build command
// rsh protocol: copy the script with rcp. The separators appended between
// the pieces are not visible; presumably the result is
// "rcp <file> <machine>:<file>" - TODO confirm.
1184 if (resInfo.Protocol == rsh)
1187 string commandRcp = "rcp ";
1188 commandRcp += _TmpFileName;
1190 commandRcp += machine;
1192 commandRcp += _TmpFileName;
1193 status = system(commandRcp.c_str());
// ssh protocol: same construction with scp.
1196 else if (resInfo.Protocol == ssh)
1199 string commandRcp = "scp ";
1200 commandRcp += _TmpFileName;
1202 commandRcp += machine;
1204 commandRcp += _TmpFileName;
1205 status = system(commandRcp.c_str());
// Any other protocol value is rejected.
1208 throw SALOME_Exception("Unknown protocol");
// Guarding condition not visible; presumably tests `status` from system().
1211 throw SALOME_Exception("Error of connection on remote host");
// The remote-access prefix is remembered before the script path is appended.
1214 _CommandForRemAccess = command;
1216 command += _TmpFileName;
1224 //=============================================================================
1225 /*! Creates a command line that the container manager uses to launch
1226 * a parallel container.
1228 //=============================================================================
// Assembles the shell command that starts a local parallel container
// executable, for the "Dummy" and "Mpi" parallel libraries.
//
// exe_name: base name of the executable; params.parallelLib is appended to it
//           to form the real executable name, and it is searched for the
//           substring "Proxy".
// params:   container request; parallelLib, hostname, nb_component_nodes and
//           the fields copied into `rtn` below are read.
// log:      "default" redirects output to a log file under /tmp; an xterm
//           wrapper is built on another branch whose condition is not visible.
//
// Throws SALOME_Exception when parallelLib is neither "Dummy" nor "Mpi"
// (the `else` introducing that branch is not visible).
//
// NOTE(review): the return type, braces, several conditions and the return
// statement of this definition are not visible in this listing; presumably
// the assembled `command` string is returned - TODO confirm.
1230 SALOME_ContainerManager::BuildCommandToLaunchLocalParallelContainer(const std::string& exe_name,
1231 const Engines::MachineParameters& params,
1232 const std::string& log)
1234 // This method knows the differences between the proxy and the nodes.
1235 // nb_component_nodes is not used in the same way if it is a proxy or
// NOTE(review): CORBA::string_dup allocates a copy that std::string copies
// again; nothing visible releases the duplicate - looks like a leak, confirm.
1239 string parallelLib(CORBA::string_dup(params.parallelLib));
1240 string hostname(CORBA::string_dup(params.hostname));
// NOTE(review): std::string::find returns size_t (npos when absent); the
// value is narrowed to int here. How `par` is tested is not visible.
1241 int par = exe_name.find("Proxy");
1242 int nbproc = params.nb_component_nodes;
// Decimal text of nbproc; the declaration of `buffer` is not visible, so its
// size cannot be checked from here.
1244 sprintf(buffer,"%d",nbproc);
// Field-by-field copy of the part of `params` used to compute the container
// name below.
1246 Engines::MachineParameters_var rtn = new Engines::MachineParameters();
1247 rtn->container_name = params.container_name;
1248 rtn->hostname = params.hostname;
1249 rtn->OS = params.OS;
1250 rtn->mem_mb = params.mem_mb;
1251 rtn->cpu_clock = params.cpu_clock;
1252 rtn->nb_proc_per_node = params.nb_proc_per_node;
1253 rtn->nb_node = params.nb_node;
1254 rtn->isMPI = params.isMPI;
1256 string real_exe_name = exe_name + parallelLib;
// "Dummy" library: run the executable directly with
// <container name> <parallelLib> <hostname> and the naming-service option.
1258 if (parallelLib == "Dummy")
1260 //command = "gdb --args ";
1261 //command = "valgrind --tool=memcheck --log-file=val_log ";
1262 //command += real_exe_name;
1264 command = real_exe_name;
1266 command += " " + _NS->ContainerName(rtn);
1267 command += " " + parallelLib;
1268 command += " " + hostname;
1270 AddOmninamesParams(command);
// "Mpi" library: two command shapes are visible below; the condition
// choosing between them is not (presumably based on `par`, i.e. proxy
// versus node - TODO confirm).
1273 else if (parallelLib == "Mpi")
1275 // Step 1 : check if MPI is started
// Body of this test is not visible; presumably it calls startMPI().
1276 if (_MpiStarted == false)
// Shape 1: nbproc MPI processes, each started with
// <container name> <parallelLib> <hostname>.
1285 command = "mpiexec -np " + string(buffer) + " ";
1286 // command += "gdb --args ";
1287 command += real_exe_name;
1288 command += " " + _NS->ContainerName(rtn);
1289 command += " " + parallelLib;
1290 command += " " + hostname;
1292 AddOmninamesParams(command);
// Shape 2: a single MPI process that receives nbproc as an extra argument
// placed before parallelLib.
1297 command = "mpiexec -np 1 ";
1298 command += real_exe_name;
1299 command += " " + _NS->ContainerName(rtn);
1300 command += " " + string(buffer);
1301 command += " " + parallelLib;
1302 command += " " + hostname;
1304 AddOmninamesParams(command);
// NOTE(review): there is no space between "Unknown parallelLib" and the
// library name in the resulting message.
1309 std::string message("Unknown parallelLib" + parallelLib);
1310 throw SALOME_Exception(message.c_str());
// Default logging: redirect stdout and stderr to a file under /tmp named
// from the container name, the host name and $USER (the separators between
// these parts are not visible), and run in the background.
1314 if (log == "default")
1316 command += " > /tmp/";
1317 command += _NS->ContainerName(rtn);
1319 command += Kernel_Utils::GetHostname();
// NOTE(review): getenv("USER") is appended without a null check; appending a
// null char* to std::string is undefined behaviour if USER is unset.
1321 command += getenv( "USER" ) ;
1322 command += ".log 2>&1 &" ;
// xterm variant: the command is wrapped in an xterm that re-exports
// LD_LIBRARY_PATH and PATH; the xterm path is hard-coded. The guarding
// condition is not visible (presumably log == "xterm").
1326 command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; "
1327 + command + " \" &";
1328 // + command + "; echo $LD_LIBRARY_PATH; cat \" &";
1332 /* if (log == "xterm")
1334 command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; echo $LD_LIBRARY_PATH; echo $PATH; " + command + "; cat \" &";
1337 /* command = "cd ; rm " + fichier_commande + "; touch " + \
1338 fichier_commande + "; echo \" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; " + \
1339 command + " >& /tmp/ribes_" + fichier_commande + " & \" > " + fichier_commande + ";";
1340 command += "ssh cn01 sh " + fichier_commande + " &";
1341 cerr << "La commande : " << command << endl;
// Starts the MPI runtime by running the `lamboot` command through system(),
// after printing a banner on stderr stating that only LAM on localhost is
// supported. Failures are reported through the INFOS macro.
//
// NOTE(review): braces, the first status test and any update of member state
// (for example _MpiStarted) are not visible in this listing - confirm what
// happens on success.
1345 void SALOME_ContainerManager::startMPI()
1347 cerr << "----------------------------------------------" << endl;
1348 cerr << "----------------------------------------------" << endl;
1349 cerr << "----------------------------------------------" << endl;
1350 cerr << "-Only Lam on Localhost is currently supported-" << endl;
1351 cerr << "----------------------------------------------" << endl;
1352 cerr << "----------------------------------------------" << endl;
1353 cerr << "----------------------------------------------" << endl;
// The raw return value of system() is kept for the checks below.
1355 int status = system("lamboot");
// Guarding condition not visible; per the message it is presumably
// `status == -1`.
1358 INFOS("lamboot failed : system command status -1");
// NOTE(review): the meaning of 217 is not documented here; other non-zero
// statuses are not handled in the visible lines.
1360 else if (status == 217)
1362 INFOS("lamboot failed : system command status 217");
1370 string SALOME_ContainerManager::GetMPIZeroNode(string machine)
1375 string tmpFile = BuildTemporaryFileName();
1377 cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile;
1379 status = system(cmd.c_str());
1381 ifstream fp(tmpFile.c_str(),ios::in);