1 // Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
2 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License as published by the Free Software Foundation; either
7 // version 2.1 of the License.
9 // This library is distributed in the hope that it will be useful
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 // Lesser General Public License for more details.
14 // You should have received a copy of the GNU Lesser General Public
15 // License along with this library; if not, write to the Free Software
16 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
20 #include "SALOME_ContainerManager.hxx"
21 #include "SALOME_NamingService.hxx"
23 #include <sys/types.h>
28 #include "Utils_CorbaException.hxx"
29 #include "Batch_Date.hxx"
31 #ifdef WITH_PACO_PARALLEL
// Initial value of the countdown used by the loops below that poll the naming
// service for a freshly launched container. The per-iteration delay is not
// visible in this excerpt, so the unit of the resulting timeout is unconfirmed.
35 #define TIME_OUT_TO_LAUNCH_CONT 21
// Definitions of the class-level (static) members.
// Cache of container references filled by fillBatchLaunchedContainers().
39 vector<Engines::Container_ptr> SALOME_ContainerManager::_batchLaunchedContainers;
// Cursor into the cache above; GiveContainer() returns the element it points to
// and advances it.
41 vector<Engines::Container_ptr>::iterator SALOME_ContainerManager::_batchLaunchedContainersIter;
// Name under which the ContainerManager is registered in the naming service
// (the literal value is on a line not visible in this excerpt).
43 const char *SALOME_ContainerManager::_ContainerManagerNameInNS =
46 //=============================================================================
50 * Define a CORBA single thread policy for the server, which avoids having to deal
51 * with non-thread-safe usage such as Change_Directory in the SALOME naming service
53 //=============================================================================
// Constructor: creates a child POA named "SThreadPOA" with the
// SINGLE_THREAD_MODEL thread policy, activates this servant on it and registers
// the resulting ContainerManager reference in the naming service.
// \param orb  ORB reference; a duplicate is kept in _orb
// \param poa  parent POA from which the single-threaded child POA is created
// \param rm   resources manager (presumably stored in _ResManager on a line not
//             visible in this excerpt -- confirm)
// \param ns   naming service (presumably stored in _NS on a line not visible in
//             this excerpt -- confirm)
55 SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, SALOME_ResourcesManager *rm, SALOME_NamingService *ns)
57 MESSAGE("constructor");
// The child POA shares the POA manager of the parent POA.
62 PortableServer::POAManager_var pman = poa->the_POAManager();
63 _orb = CORBA::ORB::_duplicate(orb) ;
64 CORBA::PolicyList policies;
66 PortableServer::ThreadPolicy_var threadPol =
67 poa->create_thread_policy(PortableServer::SINGLE_THREAD_MODEL);
// NOTE(review): slot 0 of the policy list is written here; the statement that
// sizes the list is not visible in this excerpt -- confirm it precedes this line.
68 policies[0] = PortableServer::ThreadPolicy::_duplicate(threadPol);
70 _poa = poa->create_POA("SThreadPOA",pman,policies);
// Activate the servant on the new POA and narrow its object reference to the
// ContainerManager interface before publishing it.
72 PortableServer::ObjectId_var id = _poa->activate_object(this);
73 CORBA::Object_var obj = _poa->id_to_reference(id);
74 Engines::ContainerManager_var refContMan =
75 Engines::ContainerManager::_narrow(obj);
77 _NS->Register(refContMan,_ContainerManagerNameInNS);
78 MESSAGE("constructor end");
81 //=============================================================================
85 //=============================================================================
// Destructor: the only visible action is a trace message.
87 SALOME_ContainerManager::~SALOME_ContainerManager()
89 MESSAGE("destructor");
92 //=============================================================================
94 * shutdown all the containers, then the ContainerManager servant
96 //=============================================================================
// Shuts down the containers via ShutdownContainers(), removes the
// ContainerManager entry from the naming service, then deactivates this servant
// on the single-threaded POA (_poa) it was activated on in the constructor.
98 void SALOME_ContainerManager::Shutdown()
101 ShutdownContainers();
102 _NS->Destroy_Name(_ContainerManagerNameInNS);
103 PortableServer::ObjectId_var oid = _poa->servant_to_id(this);
104 _poa->deactivate_object(oid);
108 //=============================================================================
110 * shutdown the ContainerManager servant and kill the ContainerManager process
112 //=============================================================================
// Deactivates this servant and terminates the process with EXIT_SUCCESS.
// This function never returns to its caller.
113 void SALOME_ContainerManager::ShutdownWithExit()
115 MESSAGE("ShutdownWithExit");
// NOTE(review): this path goes through _default_POA() while Shutdown() uses
// _poa, the POA the servant was activated on in the constructor. Confirm that
// _default_POA() resolves to the same POA.
116 PortableServer::ObjectId_var oid = _default_POA()->servant_to_id(this);
117 _default_POA()->deactivate_object(oid);
120 exit( EXIT_SUCCESS );
123 //=============================================================================
125 * Loop on all the containers listed in naming service, ask shutdown on each
127 //=============================================================================
// Walks the "/Containers" directory of the naming service and processes every
// entry that narrows to an Engines::Container. The statement that actually asks
// a container to shut down is not visible in this excerpt.
129 void SALOME_ContainerManager::ShutdownContainers()
131 MESSAGE("ShutdownContainers");
133 isOK = _NS->Change_Directory("/Containers");
// Pass 1: keep only the names that resolve to a non-nil Container reference.
135 vector<string> vec = _NS->list_directory_recurs();
136 list<string> lstCont;
137 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++){
139 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
140 Engines::Container_var cont=Engines::Container::_narrow(obj);
141 if(!CORBA::is_nil(cont)){
142 lstCont.push_back((*iter));
// Pass 2: iterate over the retained names (loop body not visible in this
// excerpt; judging by the message it presumably traces them).
145 MESSAGE("Container list: ");
146 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
// Pass 3: resolve each retained name again, since a reference may have become
// nil since pass 1, and handle the container if it is still reachable.
149 for(list<string>::iterator iter=lstCont.begin();iter!=lstCont.end();iter++){
151 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
152 Engines::Container_var cont=Engines::Container::_narrow(obj);
153 if(!CORBA::is_nil(cont)){
154 MESSAGE("ShutdownContainers: " << (*iter));
157 else MESSAGE("ShutdownContainers: no container ref for " << (*iter));
162 //=============================================================================
164 * Returns the PID of the container manager
166 //=============================================================================
// \return the result of getpid() for the current process, converted to CORBA::Long.
167 CORBA::Long SALOME_ContainerManager::getPID()
169 return (CORBA::Long)getpid();
172 //=============================================================================
174 * Find a suitable Container in a list of machines, or start one
175 * \param params Machine Parameters required for the container
176 * \param possibleComputers list of machines usable for find or start
178 //=============================================================================
// Looks for an already registered container matching `params` on one of
// `possibleComputers`; when none is found, launches one using the
// Engines::P_FIRST resource policy.
// \return the container reference produced by FindContainer() or StartContainer()
180 Engines::Container_ptr
181 SALOME_ContainerManager::
182 FindOrStartContainer(const Engines::MachineParameters& params,
183 const Engines::MachineList& possibleComputers)
185 Engines::Container_ptr ret = FindContainer(params,possibleComputers);
// The statement guarded by this test is not visible in this excerpt;
// presumably it returns the container that was found.
186 if(!CORBA::is_nil(ret))
188 MESSAGE("Container doesn't exist try to launch it ...");
190 return StartContainer(params,possibleComputers,Engines::P_FIRST);
194 //=============================================================================
196 * Start a suitable Container in a list of machines
197 * \param params Machine Parameters required for the container
198 * \param possibleComputers list of machines usable for start
200 //=============================================================================
// Launches a container on a machine selected from `possibleComputers` according
// to `policy`, then polls the naming service until the container registers or
// the countdown (TIME_OUT_TO_LAUNCH_CONT) reaches zero.
// \param params             machine parameters required for the container
// \param possibleComputers  candidate machines
// \param policy             Engines::P_FIRST, P_CYCL or P_BEST; selects the
//                           resources-manager lookup (FindFirst/FindNext/FindBest)
// \return Engines::Container::_nil() on the failure paths visible below; the
//         final return statement is not visible in this excerpt.
202 Engines::Container_ptr
203 SALOME_ContainerManager::
204 StartContainer(const Engines::MachineParameters& params,
205 const Engines::MachineList& possibleComputers,
206 Engines::ResPolicy policy)
// With PaCO++ support compiled in, a non-empty params.parallelLib redirects the
// request to the parallel-container path.
208 #ifdef WITH_PACO_PARALLEL
209 std::string parallelLib(params.parallelLib);
210 if (parallelLib != "")
211 return FindOrStartParallelContainer(params, possibleComputers);
214 string containerNameInNS;
// Buffer for the decimal text of `id` written by sprintf below.
215 char idc[3*sizeof(long)];
216 Engines::Container_ptr ret = Engines::Container::_nil();
218 MESSAGE("SALOME_ContainerManager::StartContainer " <<
219 possibleComputers.length());
// Machine selection: one resources-manager lookup per policy value.
224 case Engines::P_FIRST:
225 theMachine=_ResManager->FindFirst(possibleComputers);
227 case Engines::P_CYCL:
228 theMachine=_ResManager->FindNext(possibleComputers);
230 case Engines::P_BEST:
231 theMachine=_ResManager->FindBest(possibleComputers);
// A SALOME_Exception raised during selection yields a nil container.
235 catch( const SALOME_Exception &ex ){
237 return Engines::Container::_nil();
240 MESSAGE("try to launch it on " << theMachine);
242 // Get Id for container: a parallel container registers in Naming Service
243 // on the machine where process 0 runs. ContainerManager doesn't know the name
244 // of this machine before the launch of the parallel container. So to get
245 // the IOR of the parallel container in Naming Service, ContainerManager
246 // gives a unique Id. The parallel container registers its name under
247 // /ContainerManager/Id directory in NamingService
249 id = GetIdForContainer();
253 MESSAGE("SALOME_ContainerManager::StartContainer : " <<
254 "no possible computer");
255 return Engines::Container::_nil();
// Local launch when the selected machine is this host, remote launch otherwise.
257 else if(theMachine==GetHostname())
258 command=_ResManager->BuildCommandToLaunchLocalContainer(params,id);
260 command = _ResManager->BuildCommandToLaunchRemoteContainer(theMachine,params,id);
262 _ResManager->RmTmpFile();
// The launch command is executed through the shell. The failure messages below
// name SALOME_LifeCycleCORBA::StartOrFindContainer although this code lives in
// SALOME_ContainerManager::StartContainer; the strings are left untouched.
263 int status=system(command.c_str());
265 MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed " <<
266 "(system command status -1)");
267 return Engines::Container::_nil();
// Status 217 is treated as a launch failure; its origin is not documented in
// the visible code.
269 else if (status == 217){
270 MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed " <<
271 "(system command status 217)");
272 return Engines::Container::_nil();
// Poll the naming service until the container reference resolves or the
// countdown is exhausted. The delay and the decrement of `count` are on lines
// not visible in this excerpt.
275 int count=TIME_OUT_TO_LAUNCH_CONT;
276 MESSAGE("count = "<<count);
277 while ( CORBA::is_nil(ret) && count ){
285 MESSAGE( count << ". Waiting for container on " << theMachine);
// Two naming schemes are used: "/ContainerManager/id<N>" built from the id
// obtained above, or the regular name built from params and the machine. The
// condition choosing between them is not visible in this excerpt.
288 containerNameInNS = "/ContainerManager/id";
289 sprintf(idc,"%ld",id);
290 containerNameInNS += idc;
293 containerNameInNS = _NS->BuildContainerNameForNS(params,theMachine.c_str());
295 SCRUTE(containerNameInNS);
296 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
297 ret=Engines::Container::_narrow(obj);
300 if ( CORBA::is_nil(ret) )
301 MESSAGE("SALOME_LifeCycleCORBA::StartOrFindContainer rsh failed");
307 //=============================================================================
309 * Start a suitable Container in a list of machines
310 * \param params Machine Parameters required for the container
311 * \param possibleComputers list of machines usable for start
313 //=============================================================================
// Overload that first asks the resources manager for the machines fitting
// `params` and `componentList`, then delegates to the machine-list overload of
// StartContainer with the given `policy`.
// \return whatever the machine-list overload returns
315 Engines::Container_ptr
316 SALOME_ContainerManager::
317 StartContainer(const Engines::MachineParameters& params,
318 Engines::ResPolicy policy,
319 const Engines::CompoList& componentList)
321 Engines::MachineList_var possibleComputers = _ResManager->GetFittingResources(params,componentList);
322 return StartContainer(params,possibleComputers,policy);
325 #ifdef WITH_PACO_PARALLEL
326 //=============================================================================
328 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
329 * \param params Machine Parameters required for the container
330 * \param possibleComputers list of machines usable for find or start
332 * \return CORBA container reference.
334 //=============================================================================
// PaCO++ build: looks for an existing container matching the parameters and,
// when none is found, launches a parallel container (one proxy plus
// params.nb_component_nodes nodes) on the local host and connects the nodes.
// The final return statement is not visible in this excerpt.
335 Engines::Container_ptr
336 SALOME_ContainerManager::
337 FindOrStartParallelContainer(const Engines::MachineParameters& params_const,
338 const Engines::MachineList& possibleComputers)
340 CORBA::Object_var obj;
341 Engines::Container_ptr ret = Engines::Container::_nil();
// Work on a private copy because params.hostname is overwritten below.
342 Engines::MachineParameters params(params_const);
344 // Step 1 : Try to find a suitable container
345 // Currently not as good as it could be, since
346 // we would have to verify the number of nodes of the container
347 // when the user specifies it.
348 ret = FindContainer(params, possibleComputers);
350 if(CORBA::is_nil(ret)) {
351 // Step 2 : Starting a new parallel container
352 INFOS("[FindOrStartParallelContainer] Starting a parallel container");
354 // Step 2.1 : Choose a computer
355 string theMachine = _ResManager->FindFirst(possibleComputers);
356 if(theMachine == "") {
357 INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
358 INFOS("[FindOrStartParallelContainer] No possible computer found");
359 INFOS("[FindOrStartParallelContainer] !!!!!!!!!!!!!!!!!!!!!!!!!!");
362 INFOS("[FindOrStartParallelContainer] on machine : " << theMachine);
364 if(theMachine == GetHostname()) {
365 // Step 3 : starting parallel container proxy
366 params.hostname = CORBA::string_dup(theMachine.c_str());
367 Engines::MachineParameters params_proxy(params);
// Note the order: the proxy command is built while params_proxy still holds the
// real node count; nb_component_nodes is zeroed only afterwards.
368 command = _ResManager->BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerProxy", params_proxy, "xterm");
369 // LaunchParallelContainer uses this value to know if it launches the proxy or the nodes
370 params_proxy.nb_component_nodes = 0;
371 obj = LaunchParallelContainer(command, params_proxy, _NS->ContainerName(params));
372 ret = Engines::Container::_narrow(obj);
374 // Step 4 : starting parallel container nodes
375 command = _ResManager->BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerNode", params, "xterm");
376 string name = _NS->ContainerName(params) + "Node";
377 LaunchParallelContainer(command, params, name);
379 // Step 5 : connecting nodes and the proxy to actually create a parallel container
381 for (int i = 0; i < params.nb_component_nodes; i++) {
// Node names are the base name followed by the node index. The two formatting
// calls below are presumably alternatives selected by a platform #ifdef that is
// not visible in this excerpt.
385 snprintf(buffer,5,"%d",i);
387 _snprintf(buffer,5,"%d",i);
389 string name_cont = name + string(buffer);
// NOTE(review): the string returned by CORBA::string_dup is copied into the
// std::string and not released in the visible code -- looks like a leak; confirm.
391 string theNodeMachine(CORBA::string_dup(params.hostname));
392 string containerNameInNS = _NS->BuildContainerNameForNS(name_cont.c_str(),theNodeMachine.c_str());
// Retry resolving the node reference until it is non-nil or the countdown is
// exhausted (delay and decrement are on lines not visible in this excerpt).
393 int count = TIME_OUT_TO_LAUNCH_CONT;
394 obj = _NS->Resolve(containerNameInNS.c_str());
395 while (CORBA::is_nil(obj) && count) {
396 INFOS("[FindOrStartParallelContainer] CONNECTION FAILED !!!!!!!!!!!!!!!!!!!!!!!!");
403 obj = _NS->Resolve(containerNameInNS.c_str());
406 PaCO::InterfaceParallel_var node = PaCO::InterfaceParallel::_narrow(obj);
// NOTE(review): this message and the "deployed" one below print the base `name`,
// not the per-node `name_cont`.
407 MESSAGE("[FindOrStartParallelContainer] Deploying node : " << name);
// Every exception raised while deploying the node is logged; whether execution
// continues afterwards cannot be determined from this excerpt.
411 catch(CORBA::SystemException& e)
413 INFOS("Caught CORBA::SystemException. : " << e);
415 catch(PortableServer::POA::ServantAlreadyActive&)
417 INFOS("Caught CORBA::ServantAlreadyActiveException");
419 catch(CORBA::Exception&)
421 INFOS("Caught CORBA::Exception.");
423 catch(std::exception& exc)
425 INFOS("Caught std::exception - "<<exc.what());
429 INFOS("Caught unknown exception.");
431 INFOS("[FindOrStartParallelContainer] node " << name << " deployed");
// Reached when the selected machine is not the local host.
435 INFOS("[FindOrStartParallelContainer] Currently parallel containers are launched only on the local host");
442 //=============================================================================
444 * Find or Start a suitable PaCO++ Parallel Container in a list of machines.
445 * \param params Machine Parameters required for the container
446 * \param possibleComputers list of machines usable for find or start
448 * \return CORBA container reference.
450 //=============================================================================
// Variant used when PaCO++ support is not compiled in: it only reports that the
// feature is disabled. `ret` is initialised to a nil container; the return
// statement is not visible in this excerpt.
451 Engines::Container_ptr
452 SALOME_ContainerManager::
453 FindOrStartParallelContainer(const Engines::MachineParameters& params,
454 const Engines::MachineList& possibleComputers)
456 Engines::Container_ptr ret = Engines::Container::_nil();
457 INFOS("[FindOrStartParallelContainer] is disabled !");
458 INFOS("[FindOrStartParallelContainer] recompile SALOME Kernel to enable parallel extension");
463 //=============================================================================
465 * Give a suitable Container in a list of machines
466 * \param params Machine Parameters required for the container
467 * \param possibleComputers list of machines usable for start
469 //=============================================================================
// Returns a container for the request. When the environment variable
// SALOME_BATCH equals "1", containers are handed out one after another from the
// static cache filled by fillBatchLaunchedContainers(); otherwise the request is
// forwarded to StartContainer(params, policy, componentList).
471 Engines::Container_ptr
472 SALOME_ContainerManager::
473 GiveContainer(const Engines::MachineParameters& params,
474 Engines::ResPolicy policy,
475 const Engines::CompoList& componentList)
477 char *valenv=getenv("SALOME_BATCH");
// NOTE(review): getenv() yields a null pointer when the variable is unset. A
// guard may exist on a line not visible in this excerpt -- confirm that valenv
// is checked before it reaches strcmp.
479 if (strcmp(valenv,"1")==0)
// The cache is filled lazily on first use.
481 if(_batchLaunchedContainers.empty())
482 fillBatchLaunchedContainers();
// Returns the current cached container and advances the shared cursor.
// NOTE(review): no end-of-vector check is visible before the dereference.
483 return *(_batchLaunchedContainersIter++);
485 return StartContainer(params,policy,componentList);
488 //=============================================================================
492 //=============================================================================
// Resolves, in the naming service, the container name built from `params` and
// `theMachine`.
// \return the narrowed container reference when the name resolves to a non-nil
//         object, Engines::Container::_nil() otherwise
494 Engines::Container_ptr
495 SALOME_ContainerManager::
496 FindContainer(const Engines::MachineParameters& params,
497 const char *theMachine)
499 string containerNameInNS(_NS->BuildContainerNameForNS(params,theMachine));
500 CORBA::Object_var obj = _NS->Resolve(containerNameInNS.c_str());
501 if( !CORBA::is_nil(obj) )
502 return Engines::Container::_narrow(obj);
504 return Engines::Container::_nil();
507 //=============================================================================
511 //=============================================================================
// Tries each machine of `possibleComputers` in order using the single-machine
// FindContainer overload.
// \return Engines::Container::_nil() when the loop completes without a match;
//         the statement executed on a match is not visible in this excerpt
//         (presumably it returns the container found)
513 Engines::Container_ptr
514 SALOME_ContainerManager::
515 FindContainer(const Engines::MachineParameters& params,
516 const Engines::MachineList& possibleComputers)
518 MESSAGE("FindContainer "<<possibleComputers.length());
519 for(unsigned int i=0;i<possibleComputers.length();i++)
521 MESSAGE("FindContainer possible " << possibleComputers[i]);
522 Engines::Container_ptr cont = FindContainer(params,possibleComputers[i]);
523 if( !CORBA::is_nil(cont) )
526 MESSAGE("FindContainer: not found");
527 return Engines::Container::_nil();
530 //=============================================================================
531 /*! This method launches the parallel container.
532 * It may eventually be placed in the resources manager.
534 * \param command to launch
535 * \param container's parameters
536 * \param name of the container
538 * \return CORBA container reference
540 //=============================================================================
// Runs `command` through the shell and waits for the launched process(es) to
// appear in the naming service. params.nb_component_nodes selects the mode:
// 0 means "launch the proxy", any other value means "launch the nodes" and wait
// for each of them in turn. The return-type line and the final return statement
// are not visible in this excerpt.
542 SALOME_ContainerManager::LaunchParallelContainer(const std::string& command,
543 const Engines::MachineParameters& params,
544 const std::string& name)
546 CORBA::Object_ptr obj = CORBA::Object::_nil();
547 string containerNameInNS;
// Proxy mode.
549 if (params.nb_component_nodes == 0) {
550 INFOS("[LaunchParallelContainer] launching the proxy of the parallel container");
551 int status = system(command.c_str());
553 INFOS("[LaunchParallelContainer] failed : system command status -1");
555 else if (status == 217) {
556 INFOS("[LaunchParallelContainer] failed : system command status 217");
559 int count = TIME_OUT_TO_LAUNCH_CONT;
// NOTE(review): the string returned by CORBA::string_dup is copied into the
// std::string and not released in the visible code -- looks like a leak; confirm.
560 string theMachine(CORBA::string_dup(params.hostname));
561 containerNameInNS = _NS->BuildContainerNameForNS((char*) name.c_str(),theMachine.c_str());
563 INFOS("[LaunchContainer] Waiting for Parallel Container proxy on " << theMachine);
// Poll until the proxy resolves or the countdown is exhausted (delay and
// decrement are on lines not visible in this excerpt).
564 while (CORBA::is_nil(obj) && count) {
571 obj = _NS->Resolve(containerNameInNS.c_str());
// Node mode.
575 INFOS("[LaunchParallelContainer] launching the nodes of the parallel container");
576 int status = system(command.c_str());
578 INFOS("[LaunchParallelContainer] failed : system command status -1");
580 else if (status == 217) {
581 INFOS("[LaunchParallelContainer] failed : system command status 217");
583 // We are waiting all the nodes
584 for (int i = 0; i < params.nb_component_nodes; i++) {
// Each node is awaited with a fresh nil reference and a fresh countdown; `obj`
// therefore ends up holding the result for the last node only.
585 obj = CORBA::Object::_nil();
586 int count = TIME_OUT_TO_LAUNCH_CONT;
// The two formatting calls below are presumably alternatives selected by a
// platform #ifdef that is not visible in this excerpt.
591 snprintf(buffer,5,"%d",i);
593 _snprintf(buffer,5,"%d",i);
596 string name_cont = name + string(buffer);
598 // I don't like this...
// NOTE(review): same CORBA::string_dup pattern as in the proxy branch above.
599 string theMachine(CORBA::string_dup(params.hostname));
600 containerNameInNS = _NS->BuildContainerNameForNS((char*) name_cont.c_str(),theMachine.c_str());
601 cerr << "[LaunchContainer] Waiting for Parllel Container node " << containerNameInNS << " on " << theMachine << endl;
602 while (CORBA::is_nil(obj) && count) {
609 obj = _NS->Resolve(containerNameInNS.c_str());
// Reports failure when the last awaited reference is still nil.
614 if ( CORBA::is_nil(obj) ) {
615 INFOS("[LaunchParallelContainer] failed");
620 //=============================================================================
622 * Get Id for container: a parallel container registers in Naming Service
623 * on the machine where process 0 runs. ContainerManager doesn't know the name
624 * of this machine before the launch of the parallel container. So to get
625 * the IOR of the parallel container in Naming Service, ContainerManager
626 * gives a unique Id. The parallel container registers its name under
627 * /ContainerManager/Id directory in NamingService
629 //=============================================================================
// Supplies the id that StartContainer appends to "/ContainerManager/id" when it
// looks a launched container up in the naming service. The body is not visible
// in this excerpt, so how the id is generated is unconfirmed.
632 long SALOME_ContainerManager::GetIdForContainer(void)
// Rebuilds the static cache of containers: empties it, lists the "/Containers"
// directory of the naming service recursively, stores every entry that narrows
// to a non-nil Engines::Container, and resets the shared cursor to the first
// cached element.
638 void SALOME_ContainerManager::fillBatchLaunchedContainers()
640 _batchLaunchedContainers.clear();
641 _NS->Change_Directory("/Containers");
642 vector<string> vec = _NS->list_directory_recurs();
643 for(vector<string>::iterator iter = vec.begin();iter!=vec.end();iter++){
644 CORBA::Object_var obj=_NS->Resolve((*iter).c_str());
// The narrowed reference is held in a plain _ptr rather than a _var, presumably
// so that the reference stored in the vector outlives this iteration -- confirm
// who releases these references.
645 Engines::Container_ptr cont=Engines::Container::_narrow(obj);
646 if(!CORBA::is_nil(cont)){
647 _batchLaunchedContainers.push_back(cont);
650 _batchLaunchedContainersIter=_batchLaunchedContainers.begin();