1 // Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
2 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License as published by the Free Software Foundation; either
7 // version 2.1 of the License.
9 // This library is distributed in the hope that it will be useful
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 // Lesser General Public License for more details.
14 // You should have received a copy of the GNU Lesser General Public
15 // License along with this library; if not, write to the Free Software
16 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
20 #include "SALOME_ContainerManager.hxx"
21 #include "SALOME_NamingService.hxx"
23 #include <sys/types.h>
28 #include "Utils_CorbaException.hxx"
29 #include "Batch_Date.hxx"
31 #ifdef WITH_PACO_PARALLEL
// Initial value of the `count` variable tested by the container wait loops in
// this file (see StartContainer and LaunchParallelContainer).
// NOTE(review): the unit (seconds vs. iterations) is not visible here — confirm.
35 #define TIME_OUT_TO_LAUNCH_CONT 21
// Class-wide list of container references; it is cleared and refilled by
// fillBatchLaunchedContainers().
39 vector<Engines::Container_ptr> SALOME_ContainerManager::_batchLaunchedContainers;
// Cursor into _batchLaunchedContainers: reset to begin() by
// fillBatchLaunchedContainers() and post-incremented by GiveContainer().
41 vector<Engines::Container_ptr>::iterator SALOME_ContainerManager::_batchLaunchedContainersIter;
// Name used to register/unregister the ContainerManager in the naming service.
// The literal value sits on a line that is not visible in this listing.
43 const char *SALOME_ContainerManager::_ContainerManagerNameInNS =
46 //=============================================================================
50 * Define a CORBA single thread policy for the server, which avoids dealing
51 * with non-thread-safe usage such as Change_Directory in the SALOME naming service
53 //=============================================================================
// Constructs the ContainerManager servant.
// Visible steps: duplicate the ORB reference into _orb, create a child POA
// named "SThreadPOA" with the SINGLE_THREAD_MODEL thread policy, activate this
// object on that POA and register the resulting reference in the naming
// service under _ContainerManagerNameInNS.
// \param orb ORB reference, duplicated into _orb
// \param poa parent POA; its POAManager is reused for the child POA
// \param rm resources manager (its assignment to a member is not visible here)
// \param ns naming service (its assignment to a member is not visible here)
// NOTE(review): the embedded line numbers skip (57 -> 62, 64 -> 66), so some
// statements of this constructor are missing from this listing.
55 SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, SALOME_ResourcesManager *rm, SALOME_NamingService *ns)
57 MESSAGE("constructor");
// The child POA shares the parent's POAManager.
62 PortableServer::POAManager_var pman = poa->the_POAManager();
63 _orb = CORBA::ORB::_duplicate(orb) ;
// NOTE(review): policies[0] is assigned below; the statement that sizes the
// list (expected: policies.length(1)) is not visible here — confirm it exists.
64 CORBA::PolicyList policies;
66 PortableServer::ThreadPolicy_var threadPol =
67 poa->create_thread_policy(PortableServer::SINGLE_THREAD_MODEL);
68 policies[0] = PortableServer::ThreadPolicy::_duplicate(threadPol);
// All requests to this servant go through the single-threaded POA.
70 _poa = poa->create_POA("SThreadPOA",pman,policies);
// Activate this servant and publish its reference in the naming service.
72 PortableServer::ObjectId_var id = _poa->activate_object(this);
73 CORBA::Object_var obj = _poa->id_to_reference(id);
74 Engines::ContainerManager_var refContMan =
75 Engines::ContainerManager::_narrow(obj);
77 _NS->Register(refContMan,_ContainerManagerNameInNS);
78 MESSAGE("constructor end");
81 //=============================================================================
85 //=============================================================================
// Destructor. The only statement visible in this listing is the trace message.
87 SALOME_ContainerManager::~SALOME_ContainerManager()
89 MESSAGE("destructor");
92 //=============================================================================
94 * shutdown all the containers, then the ContainerManager servant
96 //=============================================================================
// Shuts down every container, then withdraws the ContainerManager itself:
// its name is removed from the naming service and the servant is deactivated
// on the POA created in the constructor.
98 void SALOME_ContainerManager::Shutdown()
// Containers first, while the manager is still registered.
101 ShutdownContainers();
102 _NS->Destroy_Name(_ContainerManagerNameInNS);
103 PortableServer::ObjectId_var oid = _poa->servant_to_id(this);
104 _poa->deactivate_object(oid);
105 //_remove_ref() has already been done at creation
109 //=============================================================================
111 * Loop on all the containers listed in naming service, ask shutdown on each
113 //=============================================================================
// Walks the "/Containers" directory of the naming service and processes every
// entry that narrows to a non-nil Engines::Container.
// Visible steps: (1) list the directory recursively and keep the names of the
// entries that are containers, (2) trace that list, (3) resolve each kept name
// again and handle it; CORBA and unknown exceptions are reported through INFOS
// and ignored.
// NOTE(review): the embedded line numbers skip inside the last loop (141 ->
// 146), so the actual shutdown call, the `try` header and the `else` branch
// are not visible in this listing.
115 void SALOME_ContainerManager::ShutdownContainers()
117 MESSAGE("ShutdownContainers");
// isOK is declared on a line that is not visible in this listing.
119 isOK = _NS->Change_Directory("/Containers");
121 vector<string> vec = _NS->list_directory_recurs();
// Pass 1: keep only the entries that are containers.
122 list<string> lstCont;
123 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++){
125 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
126 Engines::Container_var cont=Engines::Container::_narrow(obj);
127 if(!CORBA::is_nil(cont)){
128 lstCont.push_back((*iter));
// Pass 2: trace the collected names (loop body not visible in this listing).
131 MESSAGE("Container list: ");
132 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
// Pass 3: resolve each name again before acting on it, since the entry may
// have disappeared from the naming service in the meantime.
135 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
137 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
138 Engines::Container_var cont=Engines::Container::_narrow(obj);
139 if(!CORBA::is_nil(cont))
141 MESSAGE("ShutdownContainers: " << (*iter));
// Failures on one container are logged and do not stop the loop.
146 catch(CORBA::SystemException& e)
148 INFOS("CORBA::SystemException ignored : " << e);
150 catch(CORBA::Exception&)
152 INFOS("CORBA::Exception ignored.");
156 INFOS("Unknown exception ignored.");
// Reached when the name no longer resolves to a container.
160 MESSAGE("ShutdownContainers: no container ref for " << (*iter));
165 //=============================================================================
167 * Find a suitable Container in a list of machines, or start one
168 * \param params Machine Parameters required for the container
169 * \param possibleComputers list of machines usable for find or start
171 //=============================================================================
// Looks for an existing container matching `params` on one of
// `possibleComputers`; when none is found, starts one with the P_FIRST policy.
// \param params machine parameters required for the container
// \param possibleComputers machines usable for the search or the launch
// \return the container reference returned by FindContainer or StartContainer
// NOTE(review): the statement executed when `ret` is not nil (embedded line
// 180, presumably `return ret;`) is not visible in this listing.
173 Engines::Container_ptr
174 SALOME_ContainerManager::
175 FindOrStartContainer(const Engines::MachineParameters& params,
176 const Engines::MachineList& possibleComputers)
178 Engines::Container_ptr ret = FindContainer(params,possibleComputers);
179 if(!CORBA::is_nil(ret))
181 MESSAGE("Container doesn't exist try to launch it ...");
183 return StartContainer(params,possibleComputers,Engines::P_FIRST);
187 //=============================================================================
189 * Start a suitable Container in a list of machines
190 * \param params Machine Parameters required for the container
191 * \param possibleComputers list of machines usable for start
193 //=============================================================================
// Launches a new container on a machine chosen from `possibleComputers`.
// Visible steps: pick a machine according to `policy`, build the launch
// command through the resources manager, remove any stale naming-service entry
// with the same name, run the command in the background with its output
// redirected to a log file, then poll the naming service until the container
// appears or the counter started at TIME_OUT_TO_LAUNCH_CONT reaches zero.
// \param params machine parameters required for the container
// \param possibleComputers machines usable for the launch
// \param policy selection policy (P_FIRST, P_CYCL or P_BEST)
// \return nil when no machine is available or the system() call fails; the
// final return statement is not visible in this listing.
// NOTE(review): the embedded line numbers skip in many places (switch/try
// headers, `if` conditions, the sleep and decrement inside the wait loop), so
// this listing is incomplete; the comments below describe visible lines only.
195 Engines::Container_ptr
196 SALOME_ContainerManager::
197 StartContainer(const Engines::MachineParameters& params,
198 const Engines::MachineList& possibleComputers,
199 Engines::ResPolicy policy)
// Parallel (PaCO++) requests are delegated when a parallel library is named.
201 #ifdef WITH_PACO_PARALLEL
202 std::string parallelLib(params.parallelLib);
203 if (parallelLib != "")
204 return FindOrStartParallelContainer(params, possibleComputers);
207 string containerNameInNS;
// Buffer for the decimal text of `id`, filled by sprintf("%ld") below.
208 char idc[3*sizeof(long)];
209 Engines::Container_ptr ret = Engines::Container::_nil();
211 MESSAGE("SALOME_ContainerManager::StartContainer " <<
212 possibleComputers.length());
// Machine selection, one resources-manager call per policy.
217 case Engines::P_FIRST:
218 theMachine=_ResManager->FindFirst(possibleComputers);
220 case Engines::P_CYCL:
221 theMachine=_ResManager->FindNext(possibleComputers);
223 case Engines::P_BEST:
224 theMachine=_ResManager->FindBest(possibleComputers);
// A SALOME_Exception raised during selection yields a nil container.
228 catch( const SALOME_Exception &ex ){
230 return Engines::Container::_nil();
233 //If the machine name is localhost use the real name
234 if(theMachine == "localhost")
235 theMachine=GetHostname();
237 MESSAGE("try to launch it on " << theMachine);
239 // Get Id for container: a parallel container registers in Naming Service
240 // on the machine hosting process 0. ContainerManager doesn't know the name
241 // of this machine before the launch of the parallel container. So to get
242 // the IOR of the parallel container in Naming Service, ContainerManager
243 // gives a unique Id. The parallel container registers its name under
244 // /ContainerManager/Id directory in NamingService
246 id = GetIdForContainer();
// The condition guarding this branch is not visible (presumably an empty
// machine name — confirm).
250 MESSAGE("SALOME_ContainerManager::StartContainer : " <<
251 "no possible computer");
252 return Engines::Container::_nil();
// Local and remote launches use different command builders.
254 else if(theMachine==GetHostname())
255 command=_ResManager->BuildCommandToLaunchLocalContainer(params,id);
257 command = _ResManager->BuildCommandToLaunchRemoteContainer(theMachine,params,id);
259 _ResManager->RmTmpFile();
261 //check if an entry exists in Naming service
// Two naming schemes: "/ContainerManager/id<N>" built from the id, or the name
// built by the naming service from params and machine. The condition choosing
// between them is not visible in this listing.
264 containerNameInNS = "/ContainerManager/id";
265 sprintf(idc,"%ld",id);
266 containerNameInNS += idc;
269 containerNameInNS = _NS->BuildContainerNameForNS(params,theMachine.c_str());
271 SCRUTE(containerNameInNS);
272 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
273 if ( !CORBA::is_nil(obj) )
275 // unregister the registered container if it exists
276 _NS->Destroy_Name(containerNameInNS.c_str());
277 // unregister component instances ???
278 //Engines::Container_var cont=Engines::Container::_narrow(obj);
281 //redirect stdout and stderr in a file
// NOTE(review): getenv("USER") can return a null pointer, and appending a null
// char* to a std::string is undefined behaviour — confirm USER is always set.
282 string logFilename="/tmp/"+_NS->ContainerName(params)+"_"+GetHostname()+"_"+getenv( "USER" )+".log" ;
283 command += " > " + logFilename + " 2>&1 &";
285 // launch container with a system call
// NOTE(review): the command string is handed to the shell; confirm that the
// values taken from params cannot contain shell metacharacters.
286 int status=system(command.c_str());
// The `status == -1` test is on a line that is not visible in this listing.
288 MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed " <<
289 "(system command status -1)");
290 return Engines::Container::_nil();
292 else if (status == 217){
293 MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed " <<
294 "(system command status 217)");
295 return Engines::Container::_nil();
// Poll the naming service until the container registers or the counter
// expires; the wait and the decrement of `count` are not visible here.
298 int count=TIME_OUT_TO_LAUNCH_CONT;
299 MESSAGE("count = "<<count);
300 while ( CORBA::is_nil(ret) && count ){
308 MESSAGE( count << ". Waiting for container on " << theMachine);
310 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
311 ret=Engines::Container::_narrow(obj);
314 if ( CORBA::is_nil(ret) )
316 MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed");
// On success the container is told where its log is, as "user@host:path".
320 logFilename=":"+logFilename;
321 logFilename="@"+GetHostname()+logFilename;
322 logFilename=getenv( "USER" )+logFilename;
323 ret->logfilename(logFilename.c_str());
330 //=============================================================================
332 * Start a suitable Container in a list of machines
333 * \param params Machine Parameters required for the container
334 * \param possibleComputers list of machines usable for start
336 //=============================================================================
// Overload that first asks the resources manager which machines fit `params`
// and `componentList`, then forwards to the MachineList overload.
// \param params machine parameters required for the container
// \param policy selection policy forwarded unchanged
// \param componentList components passed to GetFittingResources
// \return whatever the MachineList overload of StartContainer returns
338 Engines::Container_ptr
339 SALOME_ContainerManager::
340 StartContainer(const Engines::MachineParameters& params,
341 Engines::ResPolicy policy,
342 const Engines::CompoList& componentList)
// The _var owns the list returned by the resources manager for this call.
344 Engines::MachineList_var possibleComputers = _ResManager->GetFittingResources(params,componentList);
345 return StartContainer(params,possibleComputers,policy);
348 #ifdef WITH_PACO_PARALLEL
349 //=============================================================================
351 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
352 * \param params Machine Parameters required for the container
353 * \param possibleComputers list of machines usable for find or start
355 * \return CORBA container reference.
357 //=============================================================================
// PaCO++ build: finds a matching container or launches a parallel one.
// Visible steps: try FindContainer; otherwise pick the first usable machine
// and, only when it is the local host, launch the proxy process, then the node
// processes, then resolve each node in the naming service and narrow it to
// PaCO::InterfaceParallel.
// \param params_const machine parameters; copied locally because hostname is
// overwritten below
// \param possibleComputers machines usable for the search or the launch
// \return CORBA container reference (the return statements are not visible in
// this listing)
// NOTE(review): the embedded line numbers skip in several places (the `try`
// header, the deploy/start calls near 438-441, the `else` branch, the wait
// inside the retry loop), so this listing is incomplete.
358 Engines::Container_ptr
359 SALOME_ContainerManager::
360 FindOrStartParallelContainer(const Engines::MachineParameters& params_const,
361 const Engines::MachineList& possibleComputers)
363 CORBA::Object_var obj;
364 PaCO::InterfaceManager_var proxy;
365 Engines::Container_ptr ret = Engines::Container::_nil();
366 Engines::MachineParameters params(params_const);
368 // Step 1 : Try to find a suitable container
369 // Currently not as good as it could be, since
370 // we would have to verify the number of nodes of the container
371 // when the user specifies it.
372 ret = FindContainer(params, possibleComputers);
374 if(CORBA::is_nil(ret)) {
375 // Step 2 : Starting a new parallel container
376 INFOS("[FindOrStartParallelContainer] Starting a parallel container");
378 // Step 2.1 : Choose a computer
379 string theMachine = _ResManager->FindFirst(possibleComputers);
380 if(theMachine == "") {
381 INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
382 INFOS("[FindOrStartParallelContainer] No possible computer found");
383 INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
386 INFOS("[FindOrStartParallelContainer] on machine : " << theMachine);
388 if(theMachine == GetHostname()) {
389 // Step 3 : starting parallel container proxy
390 params.hostname = CORBA::string_dup(theMachine.c_str());
391 Engines::MachineParameters params_proxy(params);
393 command = _ResManager->BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerProxy", params_proxy, "xterm");
395 catch(const SALOME_Exception & ex){
397 return Engines::Container::_nil();
399 // LaunchParallelContainer uses this value to know if it launches the proxy or the nodes
400 params_proxy.nb_component_nodes = 0;
401 obj = LaunchParallelContainer(command, params_proxy, _NS->ContainerName(params));
// The same object is narrowed twice: as the container and as the PaCO manager.
402 ret = Engines::Container::_narrow(obj);
403 proxy = PaCO::InterfaceManager::_narrow(obj);
405 // Step 4 : starting parallel container nodes
406 command = _ResManager->BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerNode", params, "xterm");
407 string name = _NS->ContainerName(params) + "Node";
408 LaunchParallelContainer(command, params, name);
409 // Step 5 : connecting nodes and the proxy to actually create a parallel container
411 for (int i = 0; i < params.nb_component_nodes; i++) {
// Two variants of the same formatting call; the preprocessor lines selecting
// one of them are not visible in this listing.
415 snprintf(buffer,5,"%d",i);
417 _snprintf(buffer,5,"%d",i);
// Node i is registered under "<name><i>".
419 string name_cont = name + string(buffer);
// NOTE(review): the copy returned by CORBA::string_dup is copied again into
// the std::string and never released — looks like a leak; confirm.
421 string theNodeMachine(CORBA::string_dup(params.hostname));
422 string containerNameInNS = _NS->BuildContainerNameForNS(name_cont.c_str(),theNodeMachine.c_str());
423 int count = TIME_OUT_TO_LAUNCH_CONT;
424 obj = _NS->Resolve(containerNameInNS.c_str());
// Retry the lookup while the node is not registered yet.
425 while (CORBA::is_nil(obj) && count) {
426 INFOS("[FindOrStartParallelContainer] CONNECTION FAILED !!!!!!!!!!!!!!!!!!!!!!!!");
433 obj = _NS->Resolve(containerNameInNS.c_str());
436 PaCO::InterfaceParallel_var node = PaCO::InterfaceParallel::_narrow(obj);
437 MESSAGE("[FindOrStartParallelContainer] Deploying node : " << name);
// Every exception raised by the (not visible) guarded calls is logged only.
442 catch(CORBA::SystemException& e)
444 INFOS("Caught CORBA::SystemException. : " << e);
446 catch(PortableServer::POA::ServantAlreadyActive&)
448 INFOS("Caught CORBA::ServantAlreadyActiveException");
450 catch(CORBA::Exception&)
452 INFOS("Caught CORBA::Exception.");
454 catch(std::exception& exc)
456 INFOS("Caught std::exception - "<<exc.what());
460 INFOS("Caught unknown exception.");
462 INFOS("[FindOrStartParallelContainer] node " << name << " deployed");
// Message for the case where the chosen machine is not the local host.
465 INFOS("[FindOrStartParallelContainer] Currently parallel containers are launched only on the local host");
472 //=============================================================================
474 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
475 * \param params Machine Parameters required for the container
476 * \param possibleComputers list of machines usable for find or start
478 * \return CORBA container reference.
480 //=============================================================================
// Variant that only logs that the parallel extension is disabled; `ret` is
// initialised to nil and nothing visible changes it.
// NOTE(review): the preprocessor line selecting this variant and the return
// statement are not visible in this listing.
// \param params unused in the visible lines
// \param possibleComputers unused in the visible lines
481 Engines::Container_ptr
482 SALOME_ContainerManager::
483 FindOrStartParallelContainer(const Engines::MachineParameters& params,
484 const Engines::MachineList& possibleComputers)
486 Engines::Container_ptr ret = Engines::Container::_nil();
487 INFOS("[FindOrStartParallelContainer] is disabled !");
488 INFOS("[FindOrStartParallelContainer] recompile SALOME Kernel to enable parallel extension");
493 //=============================================================================
495 * Give a suitable Container in a list of machines
496 * \param params Machine Parameters required for the container
497 * \param possibleComputers list of machines usable for start
499 //=============================================================================
// Returns a container for the request. When the environment variable
// SALOME_BATCH equals "1", containers are handed out one after another from
// the class-wide _batchLaunchedContainers list (filled on first use);
// otherwise a new container is started.
// \param params machine parameters required for the container
// \param policy selection policy forwarded to StartContainer
// \param componentList components forwarded to StartContainer
// \return the next listed container in batch mode, else StartContainer's result
503 Engines::Container_ptr
502 SALOME_ContainerManager::
503 GiveContainer(const Engines::MachineParameters& params,
504 Engines::ResPolicy policy,
505 const Engines::CompoList& componentList)
// NOTE(review): getenv returns a null pointer when the variable is unset and
// strcmp on null is undefined; the guard (embedded line 508) is not visible
// in this listing — confirm it exists.
507 char *valenv=getenv("SALOME_BATCH");
509 if (strcmp(valenv,"1")==0)
511 if(_batchLaunchedContainers.empty())
512 fillBatchLaunchedContainers();
// NOTE(review): the iterator is dereferenced and advanced with no visible
// check against end(), so a call made after the list is exhausted (or while
// it is still empty) would be undefined behaviour — confirm.
513 return *(_batchLaunchedContainersIter++);
515 return StartContainer(params,policy,componentList);
518 //=============================================================================
522 //=============================================================================
// Looks up, in the naming service, the container that `params` describes on
// one given machine.
// \param params machine parameters used to build the naming-service name
// \param theMachine machine name used to build the naming-service name
// \return the narrowed container when the name resolves, nil otherwise
524 Engines::Container_ptr
525 SALOME_ContainerManager::
526 FindContainer(const Engines::MachineParameters& params,
527 const char *theMachine)
529 string containerNameInNS(_NS->BuildContainerNameForNS(params,theMachine));
530 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
531 if( !CORBA::is_nil(obj) )
532 return Engines::Container::_narrow(obj);
534 return Engines::Container::_nil();
537 //=============================================================================
541 //=============================================================================
// Tries the single-machine FindContainer on each entry of `possibleComputers`
// in order.
// \param params machine parameters required for the container
// \param possibleComputers machines to probe
// \return nil when no machine holds a matching container
// NOTE(review): the statement executed on a hit (embedded line 554,
// presumably `return cont;`) is not visible in this listing.
543 Engines::Container_ptr
544 SALOME_ContainerManager::
545 FindContainer(const Engines::MachineParameters& params,
546 const Engines::MachineList& possibleComputers)
548 MESSAGE("FindContainer "<<possibleComputers.length());
549 for(unsigned int i=0;i<possibleComputers.length();i++)
551 MESSAGE("FindContainer possible " << possibleComputers[i]);
552 Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
553 if( !CORBA::is_nil(cont) )
556 MESSAGE("FindContainer: not found");
557 return Engines::Container::_nil();
560 //=============================================================================
561 /*! This method launches the parallel container.
562 * It may later be moved to the resources manager.
564 * \param command to launch
565 * \param container's parameters
566 * \param name of the container
568 * \return CORBA container reference
570 //=============================================================================
// Runs `command` through system() and then waits for the launched processes
// to appear in the naming service.
// params.nb_component_nodes selects the mode: 0 means the proxy is being
// launched and a single name is awaited; any other value means the nodes are
// being launched and the names "<name>0" .. "<name>N-1" are awaited in turn.
// \param command shell command to run
// \param params container parameters (hostname and nb_component_nodes are read)
// \param name container name used to build the naming-service entries
// \return CORBA object reference; the return statement is not visible here
// NOTE(review): the return-type line, the `status == -1` tests, the `else`
// headers, the waits and the decrements of `count` are not visible in this
// listing (the embedded line numbers skip).
572 SALOME_ContainerManager::LaunchParallelContainer(const std::string& command,
573 const Engines::MachineParameters& params,
574 const std::string& name)
576 CORBA::Object_ptr obj = CORBA::Object::_nil();
577 string containerNameInNS;
578 MESSAGE("[LaunchParallelContainer] : command to launch...");
// Proxy mode.
580 if (params.nb_component_nodes == 0) {
581 INFOS("[LaunchParallelContainer] launching the proxy of the parallel container");
582 int status = system(command.c_str());
584 INFOS("[LaunchParallelContainer] failed : system command status -1");
586 else if (status == 217) {
587 INFOS("[LaunchParallelContainer] failed : system command status 217");
590 int count = TIME_OUT_TO_LAUNCH_CONT;
// NOTE(review): the copy returned by CORBA::string_dup is never released —
// looks like a leak; confirm.
591 string theMachine(CORBA::string_dup(params.hostname));
592 containerNameInNS = _NS->BuildContainerNameForNS((char*) name.c_str(),theMachine.c_str());
594 INFOS("[LaunchParallelContainer] Waiting for Parallel Container proxy on " << theMachine);
595 while (CORBA::is_nil(obj) && count) {
602 obj = _NS->Resolve(containerNameInNS.c_str());
// Node mode.
606 INFOS("[LaunchParallelContainer] launching the nodes of the parallel container");
607 int status = system(command.c_str());
609 INFOS("[LaunchParallelContainer] failed : system command status -1");
611 else if (status == 217) {
612 INFOS("[LaunchParallelContainer] failed : system command status 217");
614 // We are waiting for all the nodes
615 for (int i = 0; i < params.nb_component_nodes; i++) {
// obj is reset for each node, so after the loop it refers to the last node
// that was looked up.
616 obj = CORBA::Object::_nil();
617 int count = TIME_OUT_TO_LAUNCH_CONT;
// Two variants of the same formatting call; the preprocessor lines selecting
// one of them are not visible in this listing.
622 snprintf(buffer,5,"%d",i);
624 _snprintf(buffer,5,"%d",i);
627 string name_cont = name + string(buffer);
629 // I don't like this...
// NOTE(review): same unreleased CORBA::string_dup copy as above, once per node.
630 string theMachine(CORBA::string_dup(params.hostname));
631 containerNameInNS = _NS->BuildContainerNameForNS((char*) name_cont.c_str(),theMachine.c_str());
632 cerr << "[LaunchContainer] Waiting for Parllel Container node " << containerNameInNS << " on " << theMachine << endl;
633 while (CORBA::is_nil(obj) && count) {
640 obj = _NS->Resolve(containerNameInNS.c_str());
// Final check, common to both modes as far as this listing shows.
645 if ( CORBA::is_nil(obj) ) {
646 INFOS("[LaunchParallelContainer] failed");
651 //=============================================================================
653 * Get Id for container: a parallel container registers in the Naming Service
654 * on the machine hosting process 0. ContainerManager doesn't know the name
655 * of this machine before the launch of the parallel container. So, to get
656 * the IOR of the parallel container in the Naming Service, ContainerManager
657 * gives it a unique Id. The parallel container registers its name under the
658 * /ContainerManager/Id directory in the Naming Service
660 //=============================================================================
// Returns the id that StartContainer passes to the launch-command builders and
// appends to "/ContainerManager/id" to form a naming-service entry.
// NOTE(review): the body of this function is not visible in this listing, so
// how the id is generated cannot be stated here.
663 long SALOME_ContainerManager::GetIdForContainer(void)
// Rebuilds the class-wide _batchLaunchedContainers list from the naming
// service: every entry under "/Containers" that narrows to a non-nil
// Engines::Container is appended, then the shared cursor is reset to the
// first element.
669 void SALOME_ContainerManager::fillBatchLaunchedContainers()
671 _batchLaunchedContainers.clear();
672 _NS->Change_Directory("/Containers");
673 vector<string> vec = _NS->list_directory_recurs();
674 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++){
675 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
// A plain _ptr (not a _var) is used here, so the narrowed reference is not
// released at the end of the iteration and can be kept in the vector.
676 Engines::Container_ptr cont=Engines::Container::_narrow(obj);
677 if(!CORBA::is_nil(cont)){
678 _batchLaunchedContainers.push_back(cont);
// Set only after all push_back calls, since push_back may invalidate iterators.
681 _batchLaunchedContainersIter=_batchLaunchedContainers.begin();