1 // Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
2 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License as published by the Free Software Foundation; either
7 // version 2.1 of the License.
9 // This library is distributed in the hope that it will be useful
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 // Lesser General Public License for more details.
14 // You should have received a copy of the GNU Lesser General Public
15 // License along with this library; if not, write to the Free Software
16 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
20 #include "SALOME_ContainerManager.hxx"
21 #include "SALOME_NamingService.hxx"
23 #include <sys/types.h>
28 #include "Utils_CorbaException.hxx"
30 #ifdef WITH_PACO_PARALLEL
34 #define TIME_OUT_TO_LAUNCH_CONT 21
38 const char *SALOME_ContainerManager::_ContainerManagerNameInNS =
41 //=============================================================================
45 * Defines a CORBA single-thread policy for the server, which avoids having to
46 * deal with non-thread-safe usage such as Change_Directory in the SALOME naming service
48 //=============================================================================
// Constructor. Creates the naming-service and resources-manager helpers, then
// publishes this servant: a thread policy using SINGLE_THREAD_MODEL is created
// and passed to create_POA("SThreadPOA", ...), the servant is activated and its
// Engines::ContainerManager reference is registered in the naming service
// under _ContainerManagerNameInNS.
// \param orb ORB passed to the SALOME_NamingService and
//            SALOME_ResourcesManager constructors
50 SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb)
52 MESSAGE("constructor");
53 _NS = new SALOME_NamingService(orb);
54 _ResManager = new SALOME_ResourcesManager(orb);
56 PortableServer::POA_var root_poa = PortableServer::POA::_the_root_poa();
57 PortableServer::POAManager_var pman = root_poa->the_POAManager();
58 PortableServer::POA_var my_poa;
// Policy list handed to create_POA below.
// NOTE(review): policies[0] is assigned below, so the sequence needs a length
// of at least 1 first; no sizing call is visible in this excerpt - confirm.
60 CORBA::PolicyList policies;
62 PortableServer::ThreadPolicy_var threadPol =
63 root_poa->create_thread_policy(PortableServer::SINGLE_THREAD_MODEL);
64 policies[0] = PortableServer::ThreadPolicy::_duplicate(threadPol);
// The new POA is created with the root POA's manager (pman).
// NOTE(review): presumably the result is stored in my_poa, which is used
// below; the assignment is not visible in this excerpt - confirm.
67 root_poa->create_POA("SThreadPOA",pman,policies);
// Activate this servant on my_poa and convert the object id to a reference.
69 PortableServer::ObjectId_var id = my_poa->activate_object(this);
70 CORBA::Object_var obj = my_poa->id_to_reference(id);
71 Engines::ContainerManager_var refContMan =
72 Engines::ContainerManager::_narrow(obj);
// Register the reference in the naming service under _ContainerManagerNameInNS.
74 _NS->Register(refContMan,_ContainerManagerNameInNS);
75 MESSAGE("constructor end");
78 //=============================================================================
82 //=============================================================================
// Destructor; the only statement visible in this excerpt logs "destructor".
// NOTE(review): the constructor allocates _NS and _ResManager with new -
// confirm that they are released here.
84 SALOME_ContainerManager::~SALOME_ContainerManager()
86 MESSAGE("destructor");
91 //=============================================================================
93 * shutdown all the containers, then the ContainerManager servant
95 //=============================================================================
// Shuts down every container through ShutdownContainers(), then deactivates
// this servant on the POA returned by _default_POA().
97 void SALOME_ContainerManager::Shutdown()
100 ShutdownContainers();
// Look up this servant's object id, then deactivate that id.
// NOTE(review): the constructor activates the servant on the "SThreadPOA" POA;
// confirm that _default_POA() returns that same POA for this class.
// NOTE(review): _default_POA() returns an object reference; the two calls
// below use it as a temporary and never release it - confirm whether it
// should be held in a PortableServer::POA_var.
101 PortableServer::ObjectId_var oid = _default_POA()->servant_to_id(this);
102 _default_POA()->deactivate_object(oid);
107 //=============================================================================
109 * Loops over all the containers listed in the naming service and asks each one to shut down
111 //=============================================================================
// Walks the "/Containers" directory of the naming service, collects the names
// that resolve to an Engines::Container, then processes that list of names.
// NOTE(review): the statement that actually asks a container to shut down is
// not visible in this excerpt; presumably it sits in the non-nil branch of the
// last loop - confirm.
113 void SALOME_ContainerManager::ShutdownContainers()
115 MESSAGE("ShutdownContainers");
116 _NS->Change_Directory("/Containers");
117 vector<string> vec = _NS->list_directory_recurs();
// Names of the entries that narrowed successfully to Engines::Container.
118 list<string> lstCont;
// Pass 1: resolve every listed entry and keep the ones that are containers.
119 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++)
122 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
123 Engines::Container_var cont=Engines::Container::_narrow(obj);
124 if(!CORBA::is_nil(cont))
126 lstCont.push_back((*iter));
129 MESSAGE("Container list: ");
// Pass 2: iterates over the collected names (loop body not visible here).
130 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++)
// Pass 3: resolve each collected name again; names that no longer narrow to a
// container are only reported by the else branch.
134 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++)
137 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
138 Engines::Container_var cont=Engines::Container::_narrow(obj);
139 if(!CORBA::is_nil(cont))
141 MESSAGE("ShutdownContainers: " << (*iter));
144 else MESSAGE("ShutdownContainers: no container ref for " << (*iter));
148 //=============================================================================
150 * Find a suitable Container in a list of machines, or start one
151 * \param params Machine Parameters required for the container
152 * \param possibleComputers list of machines usable for find or start
154 //=============================================================================
// Looks for an existing container with FindContainer(params,
// possibleComputers); when none is found, starts one through StartContainer
// with the Engines::P_FIRST policy and returns that result.
// \param params            machine parameters required for the container
// \param possibleComputers machines on which to search or start
156 Engines::Container_ptr
157 SALOME_ContainerManager::
158 FindOrStartContainer(const Engines::MachineParameters& params,
159 const Engines::MachineList& possibleComputers)
// NOTE(review): containerNameInNS and idc are not used by any statement
// visible in this function - confirm they can be removed.
162 string containerNameInNS;
163 char idc[3*sizeof(long)];
165 Engines::Container_ptr ret = FindContainer(params,possibleComputers);
// A non-nil reference means a matching container is already registered.
// NOTE(review): the branch body is not visible here; presumably it returns ret.
166 if(!CORBA::is_nil(ret))
168 MESSAGE("Container doesn't exist try to launch it ...");
170 return StartContainer(params,possibleComputers,Engines::P_FIRST);
174 //=============================================================================
176 * Start a suitable Container in a list of machines
177 * \param params Machine Parameters required for the container
178 * \param possibleComputers list of machines usable for start
180 //=============================================================================
// Starts a container on one machine taken from possibleComputers.
// Visible steps: choose the machine according to policy, obtain an id from
// GetIdForContainer(), build a local or remote launch command through the
// resources manager, run it with system(), then look the container up in the
// naming service while the reference is nil and the counter is non-zero.
// \param params            machine parameters required for the container
// \param possibleComputers machines usable for the start
// \param policy            selects FindFirst (P_FIRST), FindNext (P_CYCL) or
//                          FindBest (P_BEST) on the resources manager
// \return Engines::Container::_nil() on the failure paths visible below
// NOTE(review): several statements of this function (declarations, switch/try
// headers, the loop's wait/decrement, the final return) are not visible in
// this excerpt.
182 Engines::Container_ptr
183 SALOME_ContainerManager::
184 StartContainer(const Engines::MachineParameters& params,
185 const Engines::MachineList& possibleComputers,
186 Engines::ResPolicy policy)
// With PaCO++ support compiled in, a request that names a parallel library is
// delegated to FindOrStartParallelContainer.
188 #ifdef WITH_PACO_PARALLEL
189 std::string parallelLib(params.parallelLib);
190 if (parallelLib != "")
191 return FindOrStartParallelContainer(params, possibleComputers);
194 string containerNameInNS;
// Text buffer for the container id; filled by sprintf("%ld") further down.
195 char idc[3*sizeof(long)];
196 Engines::Container_ptr ret = Engines::Container::_nil();
198 MESSAGE("SALOME_ContainerManager::StartContainer " <<
199 possibleComputers.length());
// Machine selection: one resources-manager call per policy value.
204 case Engines::P_FIRST:
205 theMachine=_ResManager->FindFirst(possibleComputers);
207 case Engines::P_CYCL:
208 theMachine=_ResManager->FindNext(possibleComputers);
210 case Engines::P_BEST:
211 theMachine=_ResManager->FindBest(possibleComputers);
// A SALOME_Exception caught at this point ends the call with a nil reference.
215 catch( const SALOME_Exception &ex ){
217 return Engines::Container::_nil();
220 MESSAGE("try to launch it on " << theMachine);
222 // Get Id for container: a parallel container registers in the Naming Service
223 // on the machine where process 0 runs. ContainerManager doesn't know the name
224 // of this machine before the launch of the parallel container. So, to get
225 // the IOR of the parallel container from the Naming Service, ContainerManager
226 // gives it a unique Id. The parallel container registers its name under the
227 // /ContainerManager/Id directory in the NamingService
229 id = GetIdForContainer();
// No usable machine: report it and give up (the condition is not visible here).
233 MESSAGE("SALOME_ContainerManager::StartContainer : " <<
234 "no possible computer");
235 return Engines::Container::_nil();
// Local host: build the local launch command; otherwise the remote one.
237 else if(theMachine==GetHostname())
238 command=_ResManager->BuildCommandToLaunchLocalContainer(params,id);
240 command = _ResManager->BuildCommandToLaunchRemoteContainer(theMachine,params,id);
242 _ResManager->RmTmpFile();
// NOTE(review): on POSIX, system() returns a wait status rather than the raw
// exit code - confirm that comparing it directly with 217 below is intended.
243 int status=system(command.c_str());
// NOTE(review): the log texts below name SALOME_LifeCycleCORBA::StartOrFindContainer
// although this is SALOME_ContainerManager::StartContainer.
245 MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed " <<
246 "(system command status -1)");
247 return Engines::Container::_nil();
249 else if (status == 217){
250 MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed " <<
251 "(system command status 217)");
252 return Engines::Container::_nil();
// Look the container up in the naming service: the loop runs while ret is nil
// and count is non-zero (the wait and decrement are not visible here).
255 int count=TIME_OUT_TO_LAUNCH_CONT;
256 MESSAGE("count = "<<count);
257 while ( CORBA::is_nil(ret) && count ){
265 MESSAGE( count << ". Waiting for container on " << theMachine);
// Two ways of building the lookup name are visible: "/ContainerManager/id<id>"
// or the name built from params and the machine (the selecting condition is
// not visible here).
268 containerNameInNS = "/ContainerManager/id";
269 sprintf(idc,"%ld",id);
270 containerNameInNS += idc;
273 containerNameInNS = _NS->BuildContainerNameForNS(params,theMachine.c_str());
275 SCRUTE(containerNameInNS);
276 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
277 ret=Engines::Container::_narrow(obj);
// Still nil once the loop has ended: log the failure.
280 if ( CORBA::is_nil(ret) )
281 MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed");
287 //=============================================================================
289 * Start a suitable Container on one of the machines returned by GetFittingResources
290 * \param params Machine Parameters required for the container
291 * \param policy policy used to choose the machine (forwarded to the three-argument StartContainer)
293 //=============================================================================
// Convenience overload: computes the candidate machines with
// GetFittingResources(params, "") and forwards to the three-argument
// StartContainer.
// \param params machine parameters required for the container
// \param policy forwarded unchanged to the three-argument overload
295 Engines::Container_ptr
296 SALOME_ContainerManager::
297 StartContainer(const Engines::MachineParameters& params,
298 Engines::ResPolicy policy)
// The _var holds the list pointer returned by GetFittingResources.
300 Engines::MachineList_var possibleComputers = GetFittingResources(params,"");
301 return StartContainer(params,possibleComputers,policy);
304 #ifdef WITH_PACO_PARALLEL
305 //=============================================================================
307 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
308 * \param params Machine Parameters required for the container
309 * \param possibleComputers list of machines usable for find or start
311 * \return CORBA container reference.
313 //=============================================================================
// PaCO++ build: finds an existing container matching params or, failing that,
// launches a parallel container (one proxy plus its nodes) on the local host
// and then handles each node in turn (Step 5).
// \param params_const      requested machine parameters; copied into a local
//                          because hostname is overwritten below
// \param possibleComputers machines usable for the search or the start
// NOTE(review): several statements (declarations of command and buffer, the
// try header, the deployment call, the final return) are not visible in this
// excerpt.
314 Engines::Container_ptr
315 SALOME_ContainerManager::
316 FindOrStartParallelContainer(const Engines::MachineParameters& params_const,
317 const Engines::MachineList& possibleComputers)
319 CORBA::Object_var obj;
320 Engines::Container_ptr ret = Engines::Container::_nil();
321 Engines::MachineParameters params(params_const);
323 // Step 1 : Try to find a suitable container
324 // Currently not as good as it could be, since
325 // the number of nodes of the container would have to be checked
326 // when the user specifies one.
327 ret = FindContainer(params, possibleComputers);
329 if(CORBA::is_nil(ret)) {
330 // Step 2 : Starting a new parallel container
331 INFOS("[FindOrStartParallelContainer] Starting a parallel container");
333 // Step 2.1 : Choose a computer
334 string theMachine = _ResManager->FindFirst(possibleComputers);
335 if(theMachine == "") {
336 INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
337 INFOS("[FindOrStartParallelContainer] No possible computer found");
338 INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
341 INFOS("[FindOrStartParallelContainer] on machine : " << theMachine);
// Only the local host is supported; the other case is only logged (last INFOS
// of this function).
343 if(theMachine == GetHostname()) {
344 // Step 3 : starting parallel container proxy
345 params.hostname = CORBA::string_dup(theMachine.c_str());
346 Engines::MachineParameters params_proxy(params);
347 command = _ResManager->BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerProxy", params_proxy, "xterm");
348 // LaunchParallelContainer uses this value to know if it launches the proxy or the nodes
349 params_proxy.nb_component_nodes = 0;
350 obj = LaunchParallelContainer(command, params_proxy, _NS->ContainerName(params));
351 ret = Engines::Container::_narrow(obj);
353 // Step 4 : starting parallel container nodes
354 command = _ResManager->BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerNode", params, "xterm");
355 string name = _NS->ContainerName(params) + "Node";
// NOTE(review): the value returned by LaunchParallelContainer is discarded
// here; if it is an object reference, confirm that it does not need releasing.
356 LaunchParallelContainer(command, params, name);
358 // Step 5 : connecting nodes and the proxy to actually create a parallel container
360 for (int i = 0; i < params.nb_component_nodes; i++) {
// Node names are "<name><index>"; the index is formatted with a size limit of
// 5 bytes. Presumably the two calls are platform alternatives selected by a
// preprocessor condition that is not visible here.
364 snprintf(buffer,5,"%d",i);
366 _snprintf(buffer,5,"%d",i);
368 string name_cont = name + string(buffer);
// NOTE(review): the copy made by CORBA::string_dup is copied again into the
// std::string and never freed - looks like a leak; constructing the string
// directly from params.hostname would avoid it. Confirm.
370 string theNodeMachine(CORBA::string_dup(params.hostname));
371 string containerNameInNS = _NS->BuildContainerNameForNS(name_cont.c_str(),theNodeMachine.c_str());
// Resolve the node, retrying while the reference is nil and count is non-zero
// (the wait and decrement are not visible here).
372 int count = TIME_OUT_TO_LAUNCH_CONT;
373 obj = _NS->Resolve(containerNameInNS.c_str());
374 while (CORBA::is_nil(obj) && count) {
375 INFOS("[FindOrStartParallelContainer] CONNECTION FAILED !!!!!!!!!!!!!!!!!!!!!!!!");
382 obj = _NS->Resolve(containerNameInNS.c_str());
385 PaCO::InterfaceParallel_var node = PaCO::InterfaceParallel::_narrow(obj);
386 MESSAGE("[FindOrStartParallelContainer] Deploying node : " << name);
// Each handler below only logs the exception it catches.
390 catch(CORBA::SystemException& e)
392 INFOS("Caught CORBA::SystemException. : " << e);
394 catch(PortableServer::POA::ServantAlreadyActive&)
396 INFOS("Caught CORBA::ServantAlreadyActiveException");
398 catch(CORBA::Exception&)
400 INFOS("Caught CORBA::Exception.");
402 catch(std::exception& exc)
404 INFOS("Caught std::exception - "<<exc.what());
408 INFOS("Caught unknown exception.");
410 INFOS("[FindOrStartParallelContainer] node " << name << " deployed");
414 INFOS("[FindOrStartParallelContainer] Currently parallel containers are launched only on the local host");
421 //=============================================================================
423 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
424 * \param params Machine Parameters required for the container
425 * \param possibleComputers list of machines usable for find or start
427 * \return CORBA container reference.
429 //=============================================================================
// Variant that only logs that the parallel extension is disabled; presumably
// compiled when WITH_PACO_PARALLEL is not defined (the preprocessor branch is
// not visible here). ret is initialised to a nil reference; the return
// statement is not visible in this excerpt.
// \param params            unused by the visible statements
// \param possibleComputers unused by the visible statements
430 Engines::Container_ptr
431 SALOME_ContainerManager::
432 FindOrStartParallelContainer(const Engines::MachineParameters& params,
433 const Engines::MachineList& possibleComputers)
435 Engines::Container_ptr ret = Engines::Container::_nil();
436 INFOS("[FindOrStartParallelContainer] is disabled !");
437 INFOS("[FindOrStartParallelContainer] recompile SALOME Kernel to enable parallel extension");
442 //=============================================================================
446 //=============================================================================
// Asks the resources manager for the machines fitting params/componentName
// and copies the resulting names into a newly allocated Engines::MachineList.
// A SALOME_Exception from the resources manager is rethrown through
// THROW_SALOME_CORBA_EXCEPTION with SALOME::BAD_PARAM.
// \param params        machine parameters to satisfy
// \param componentName component name forwarded to the resources manager
448 Engines::MachineList *
449 SALOME_ContainerManager::
450 GetFittingResources(const Engines::MachineParameters& params,
451 const char *componentName)
453 MESSAGE("SALOME_ContainerManager::GetFittingResources");
// NOTE(review): ret is allocated before the call that may throw, and no
// release is visible on the exception path below - possible leak; confirm.
454 Engines::MachineList *ret=new Engines::MachineList;
458 vec = _ResManager->GetFittingResources(params,componentName);
460 catch(const SALOME_Exception &ex)
462 INFOS("Caught exception.");
463 THROW_SALOME_CORBA_EXCEPTION(ex.what(),SALOME::BAD_PARAM);
467 // MESSAGE("Machine list length "<<vec.size());
// Size the CORBA sequence, then copy the machine names element by element.
468 ret->length(vec.size());
469 for(unsigned int i=0;i<vec.size();i++)
471 (*ret)[i]=(vec[i]).c_str();
476 //=============================================================================
480 //=============================================================================
// Returns the machine chosen by _ResManager->FindFirst(possibleComputers) as
// a copy made with CORBA::string_dup. The return-type line of this definition
// is not visible in this excerpt.
// \param possibleComputers candidate machines
483 SALOME_ContainerManager::
484 FindFirst(const Engines::MachineList& possibleComputers)
486 string theMachine=_ResManager->FindFirst(possibleComputers);
// string_dup makes a separate copy, so the result outlives the local string.
487 return CORBA::string_dup(theMachine.c_str());
490 //=============================================================================
494 //=============================================================================
// Looks up, in the naming service, the container whose name is built from
// params and theMachine by BuildContainerNameForNS.
// \param params     machine parameters identifying the container
// \param theMachine machine name used to build the naming-service entry
// \return the narrowed container reference when the name resolves to a
//         non-nil object, Engines::Container::_nil() otherwise
496 Engines::Container_ptr
497 SALOME_ContainerManager::
498 FindContainer(const Engines::MachineParameters& params,
499 const char *theMachine)
501 string containerNameInNS(_NS->BuildContainerNameForNS(params,theMachine));
502 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
503 if( !CORBA::is_nil(obj) )
504 return Engines::Container::_narrow(obj);
506 return Engines::Container::_nil();
509 //=============================================================================
513 //=============================================================================
// Tries the single-machine FindContainer on each entry of possibleComputers,
// in order.
// \param params            machine parameters identifying the container
// \param possibleComputers machines to try
// \return Engines::Container::_nil() when the loop ends without a match
515 Engines::Container_ptr
516 SALOME_ContainerManager::
517 FindContainer(const Engines::MachineParameters& params,
518 const Engines::MachineList& possibleComputers)
520 MESSAGE("FindContainer "<<possibleComputers.length());
521 for(unsigned int i=0;i<possibleComputers.length();i++)
523 MESSAGE("FindContainer possible " << possibleComputers[i]);
524 Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
// NOTE(review): the body of this branch is not visible here; presumably it
// returns cont for the first machine that has a matching container.
525 if( !CORBA::is_nil(cont) )
528 MESSAGE("FindContainer: not found");
529 return Engines::Container::_nil();
532 //=============================================================================
533 /*! This method launches the parallel container.
534 * It may later be moved to the resources manager.
536 * \param command command to launch
537 * \param params container's parameters
538 * \param name name of the container
540 * \return CORBA container reference
542 //=============================================================================
// Runs command with system() and then looks the launched process(es) up in
// the naming service.
// - params.nb_component_nodes == 0: the command launches the proxy, which is
//   looked up under the name built from name and params.hostname.
// - otherwise: the command launches the nodes, and each node
//   "<name><index>" is looked up in turn.
// \param command shell command to run
// \param params  container parameters (nb_component_nodes, hostname are read)
// \param name    base name used to build the naming-service entries
// NOTE(review): the return-type line, the buffer declaration, the branch
// conditions on status == -1, the loops' wait/decrement and the final return
// are not visible in this excerpt.
544 SALOME_ContainerManager::LaunchParallelContainer(const std::string& command,
545 const Engines::MachineParameters& params,
546 const std::string& name)
548 CORBA::Object_ptr obj = CORBA::Object::_nil();
549 string containerNameInNS;
551 if (params.nb_component_nodes == 0) {
552 INFOS("[LaunchParallelContainer] launching the proxy of the parallel container");
553 int status = system(command.c_str());
555 INFOS("[LaunchParallelContainer] failed : system command status -1");
557 else if (status == 217) {
558 INFOS("[LaunchParallelContainer] failed : system command status 217");
561 int count = TIME_OUT_TO_LAUNCH_CONT;
// NOTE(review): the copy made by CORBA::string_dup is copied again into the
// std::string and never freed - looks like a leak; confirm.
562 string theMachine(CORBA::string_dup(params.hostname));
563 containerNameInNS = _NS->BuildContainerNameForNS((char*) name.c_str(),theMachine.c_str());
565 INFOS("[LaunchContainer] Waiting for Parallel Container proxy on " << theMachine);
// Resolve while the reference is nil and count is non-zero.
566 while (CORBA::is_nil(obj) && count) {
573 obj = _NS->Resolve(containerNameInNS.c_str());
577 INFOS("[LaunchParallelContainer] launching the nodes of the parallel container");
578 int status = system(command.c_str());
580 INFOS("[LaunchParallelContainer] failed : system command status -1");
582 else if (status == 217) {
583 INFOS("[LaunchParallelContainer] failed : system command status 217");
585 // Wait for all the nodes
586 for (int i = 0; i < params.nb_component_nodes; i++) {
// NOTE(review): obj is a raw Object_ptr; resetting it here drops the reference
// resolved for the previous node without releasing it - confirm.
587 obj = CORBA::Object::_nil();
588 int count = TIME_OUT_TO_LAUNCH_CONT;
// The node index is formatted with a size limit of 5 bytes; presumably the two
// calls are platform alternatives selected by a preprocessor condition that is
// not visible here.
593 snprintf(buffer,5,"%d",i);
595 _snprintf(buffer,5,"%d",i);
598 string name_cont = name + string(buffer);
600 // I don't like this...
// NOTE(review): same CORBA::string_dup pattern as above, once per iteration.
601 string theMachine(CORBA::string_dup(params.hostname));
602 containerNameInNS = _NS->BuildContainerNameForNS((char*) name_cont.c_str(),theMachine.c_str());
603 cerr << "[LaunchContainer] Waiting for Parllel Container node " << containerNameInNS << " on " << theMachine << endl;
604 while (CORBA::is_nil(obj) && count) {
611 obj = _NS->Resolve(containerNameInNS.c_str());
// A nil reference at this point is reported as a failure.
616 if ( CORBA::is_nil(obj) ) {
617 INFOS("[LaunchParallelContainer] failed");
622 //=============================================================================
624 * Get Id for container: a parallel container registers in the Naming Service
625 * on the machine where process 0 runs. ContainerManager doesn't know the name
626 * of this machine before the launch of the parallel container. So, to get
627 * the IOR of the parallel container from the Naming Service, ContainerManager
628 * gives it a unique Id. The parallel container registers its name under the
629 * /ContainerManager/Id directory in the NamingService
631 //=============================================================================
634 long SALOME_ContainerManager::GetIdForContainer(void)