1 // Copyright (C) 2007-2011 CEA/DEN, EDF R&D, OPEN CASCADE
3 // Copyright (C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
4 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 2.1 of the License.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // Lesser General Public License for more details.
16 // You should have received a copy of the GNU Lesser General Public
17 // License along with this library; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
23 #include "SALOME_ResourcesManager.hxx"
24 #include "Utils_ExceptHandlers.hxx"
25 #include "Utils_CorbaException.hxx"
43 #include <sys/types.h>
45 #include "utilities.h"
// Size constant for host names (presumably the capacity of host-name buffers;
// no use is visible in this part of the file).
// The definition must not end with ';': the macro is substituted textually, so a
// trailing semicolon would break any use inside an expression or as an array bound,
// e.g. `char buf[MAX_SIZE_FOR_HOSTNAME];`.
#define MAX_SIZE_FOR_HOSTNAME 256
// Naming-service path under which the constructors register this servant and
// from which Shutdown() removes it.
49 const char *SALOME_ResourcesManager::_ResourcesManagerNameInNS = "/ResourcesManager";
51 //=============================================================================
55 //=============================================================================
// Constructor taking an explicit catalog path: xmlFilePath is forwarded to the
// constructor of the _rm member. The body keeps duplicated references to the ORB
// and the POA, activates this servant in the POA and registers the resulting
// Engines::ResourcesManager reference in the naming service under
// _ResourcesManagerNameInNS.
// NOTE(review): _NS is dereferenced below, but no assignment to it appears in the
// lines shown here - presumably it is initialised from `ns`; confirm.
57 SALOME_ResourcesManager::
58 SALOME_ResourcesManager(CORBA::ORB_ptr orb,
59 PortableServer::POA_var poa,
60 SALOME_NamingService *ns,
61 const char *xmlFilePath) : _rm(xmlFilePath)
63 MESSAGE("SALOME_ResourcesManager constructor");
// Keep our own references to the ORB and the POA.
65 _orb = CORBA::ORB::_duplicate(orb) ;
66 _poa = PortableServer::POA::_duplicate(poa) ;
// Activate this servant, then turn the returned object id into an object reference.
67 PortableServer::ObjectId_var id = _poa->activate_object(this);
68 CORBA::Object_var obj = _poa->id_to_reference(id);
// Narrow to the ResourcesManager interface and publish it in the naming service.
69 Engines::ResourcesManager_var refContMan = Engines::ResourcesManager::_narrow(obj);
70 _NS->Register(refContMan,_ResourcesManagerNameInNS);
71 MESSAGE("SALOME_ResourcesManager constructor end");
74 //=============================================================================
76 * Standard constructor, parse resource file.
77 * - if ${APPLI} exists in environment,
78 * look for ${HOME}/${APPLI}/CatalogResources.xml
79 * - else look for default:
80 * ${KERNEL_ROOT_DIR}/share/salome/resources/kernel/CatalogResources.xml
81 * - parse XML resource file.
83 //=============================================================================
// Constructor without a catalog path: the _rm member is default-constructed.
// Apart from that, the visible body performs the same steps as the constructor
// taking xmlFilePath: duplicate the ORB and POA references, activate this servant
// and register its reference under _ResourcesManagerNameInNS.
// NOTE(review): _NS is dereferenced below, but no assignment to it appears in the
// lines shown here - presumably it is initialised from `ns`; confirm.
85 SALOME_ResourcesManager::SALOME_ResourcesManager(CORBA::ORB_ptr orb,
86 PortableServer::POA_var poa,
87 SALOME_NamingService *ns) : _rm()
89 MESSAGE("SALOME_ResourcesManager constructor");
// Keep our own references to the ORB and the POA.
91 _orb = CORBA::ORB::_duplicate(orb) ;
92 _poa = PortableServer::POA::_duplicate(poa) ;
// Activate this servant, then turn the returned object id into an object reference.
93 PortableServer::ObjectId_var id = _poa->activate_object(this);
94 CORBA::Object_var obj = _poa->id_to_reference(id);
// Narrow to the ResourcesManager interface and publish it in the naming service.
95 Engines::ResourcesManager_var refContMan = Engines::ResourcesManager::_narrow(obj);
96 _NS->Register(refContMan,_ResourcesManagerNameInNS);
98 MESSAGE("SALOME_ResourcesManager constructor end");
101 //=============================================================================
103 * Standard Destructor
105 //=============================================================================
// Destructor: the only visible statement emits a trace message.
107 SALOME_ResourcesManager::~SALOME_ResourcesManager()
109 MESSAGE("SALOME_ResourcesManager destructor");
113 //=============================================================================
115 * Unregister the ResourcesManager from the naming service, then deactivate its servant
117 //=============================================================================
// Takes this servant out of service: its entry is removed from the naming
// service, then the servant is deactivated in the POA.
119 void SALOME_ResourcesManager::Shutdown()
// Remove the name registered by the constructors.
122 _NS->Destroy_Name(_ResourcesManagerNameInNS);
// Look up the object id of this servant and deactivate it.
123 PortableServer::ObjectId_var oid = _poa->servant_to_id(this);
124 _poa->deactivate_object(oid);
127 //=============================================================================
128 //! Get the names of the resources fitting the specified constraints (params)
130 * If a hostname is specified, check that it is local or known in the resources catalog.
133 * - select first machines with corresponding OS (all machines if
134 * parameter OS empty),
135 * - then select the sublist of machines on which the component is known
136 * (if the result is empty, the inventory of components has probably
137 * not been done, so the complete list from the previous step is returned)
139 //=============================================================================
// Returns the names of the resources matching `params`.
// The CORBA parameter structure is copied field by field into the C++ structure
// `p`, the selection itself is delegated to _rm.GetFittingResources(p), and the
// resulting vector of names is copied into a newly allocated Engines::ResourceList.
// A ResourcesException is logged and re-raised through
// THROW_SALOME_CORBA_EXCEPTION with SALOME::BAD_PARAM.
141 Engines::ResourceList *
142 SALOME_ResourcesManager::GetFittingResources(const Engines::ResourceParameters& params)
144 MESSAGE("ResourcesManager::GetFittingResources");
// NOTE(review): `ret` is a raw pointer obtained with new, and none of the lines
// shown here releases it when the catch handler below raises - this looks like a
// leak on the error path; confirm (holding it in an Engines::ResourceList_var and
// returning _retn() would avoid it).
145 Engines::ResourceList * ret = new Engines::ResourceList;
// Copy the request fields into `p` (its declaration is not among the lines shown here).
149 p.name = params.name;
150 p.hostname = params.hostname;
152 p.nb_proc = params.nb_proc;
153 p.nb_node = params.nb_node;
154 p.nb_proc_per_node = params.nb_proc_per_node;
155 p.cpu_clock = params.cpu_clock;
156 p.mem_mb = params.mem_mb;
// CORBA string sequences are converted element by element to std::string.
157 for(unsigned int i=0; i<params.componentList.length(); i++)
158 p.componentList.push_back(std::string(params.componentList[i]));
159 for(unsigned int i=0; i<params.resList.length(); i++)
160 p.resourceList.push_back(std::string(params.resList[i]));
164 // Call C++ ResourceManager
165 std::vector <std::string> vec = _rm.GetFittingResources(p);
// Size the CORBA sequence, then copy each name into it.
168 ret->length(vec.size());
169 for(unsigned int i=0;i<vec.size();i++)
170 (*ret)[i] = (vec[i]).c_str();
// Failures of the C++ layer are reported to the CORBA client as BAD_PARAM.
172 catch(const ResourcesException &ex)
174 INFOS("Caught exception in GetFittingResources C++: " << ex.msg);
175 THROW_SALOME_CORBA_EXCEPTION(ex.msg.c_str(),SALOME::BAD_PARAM);
181 //=============================================================================
183 * Return the resource selected by the "first" policy among the given list of resources
185 //=============================================================================
// Selects one resource among listOfResources using the "first" policy of _rm.Find.
// The CORBA sequence is first copied into a std::vector<std::string>; the value
// returned is a CORBA::string_dup copy of the name produced by _rm.Find.
188 SALOME_ResourcesManager::FindFirst(const Engines::ResourceList& listOfResources)
191 std::vector<std::string> rl;
192 for(unsigned int i=0; i<listOfResources.length(); i++)
193 rl.push_back(std::string(listOfResources[i]));
// _rm.Find returns a temporary; string_dup copies its characters before it is destroyed.
195 return CORBA::string_dup(_rm.Find("first", rl).c_str());
// Selects one resource among listOfResources using the caller-supplied `policy`,
// which is passed unchanged to _rm.Find. The CORBA sequence is first copied into
// a std::vector<std::string>; the value returned is a CORBA::string_dup copy of
// the name produced by _rm.Find.
199 SALOME_ResourcesManager::Find(const char* policy, const Engines::ResourceList& listOfResources)
202 std::vector<std::string> rl;
203 for(unsigned int i=0; i<listOfResources.length(); i++)
204 rl.push_back(std::string(listOfResources[i]));
// _rm.Find returns a temporary; string_dup copies its characters before it is destroyed.
206 return CORBA::string_dup(_rm.Find(policy, rl).c_str());
// Builds a newly allocated Engines::ResourceDefinition describing resource `name`.
// The descriptor is fetched with _rm.GetResourcesDescr(name); its string fields are
// copied with CORBA::string_dup and its enum fields (Protocol,
// ClusterInternalProtocol, mpi, Batch) are translated to their textual names.
// NOTE(review): none of the enum-to-text chains shown here has a final else, so a
// value outside the listed ones appears to leave the corresponding field untouched;
// confirm that this is intended.
209 Engines::ResourceDefinition*
210 SALOME_ResourcesManager::GetResourceDefinition(const char * name)
212 ParserResourcesType resource = _rm.GetResourcesDescr(name);
213 Engines::ResourceDefinition *p_ptr = new Engines::ResourceDefinition;
215 p_ptr->name = CORBA::string_dup(resource.Name.c_str());
216 p_ptr->hostname = CORBA::string_dup(resource.HostName.c_str());
// Access protocol used to reach the resource.
217 if( resource.Protocol == rsh )
218 p_ptr->protocol = "rsh";
219 else if( resource.Protocol == ssh )
220 p_ptr->protocol = "ssh";
221 else if( resource.Protocol == srun )
222 p_ptr->protocol = "srun";
// Protocol used between the members of a cluster.
223 if( resource.ClusterInternalProtocol == rsh )
224 p_ptr->iprotocol = "rsh";
225 else if( resource.ClusterInternalProtocol == ssh )
226 p_ptr->iprotocol = "ssh";
227 else if( resource.ClusterInternalProtocol == srun )
228 p_ptr->iprotocol = "srun";
229 p_ptr->username = CORBA::string_dup(resource.UserName.c_str());
230 p_ptr->applipath = CORBA::string_dup(resource.AppliPath.c_str());
// Size the component sequence, then copy each component name.
231 p_ptr->componentList.length(resource.ComponentsList.size());
232 for(unsigned int i=0;i<resource.ComponentsList.size();i++)
233 p_ptr->componentList[i] = CORBA::string_dup(resource.ComponentsList[i].c_str());
234 p_ptr->OS = CORBA::string_dup(resource.OS.c_str());
// Numeric characteristics are taken from the DataForSort sub-structure.
235 p_ptr->mem_mb = resource.DataForSort._memInMB;
236 p_ptr->cpu_clock = resource.DataForSort._CPUFreqMHz;
237 p_ptr->nb_proc_per_node = resource.DataForSort._nbOfProcPerNode;
238 p_ptr->nb_node = resource.DataForSort._nbOfNodes;
// MPI implementation name.
240 if( resource.mpi == lam )
241 p_ptr->mpiImpl = "lam";
242 else if( resource.mpi == mpich1 )
243 p_ptr->mpiImpl = "mpich1";
244 else if( resource.mpi == mpich2 )
245 p_ptr->mpiImpl = "mpich2";
246 else if( resource.mpi == openmpi )
247 p_ptr->mpiImpl = "openmpi";
248 else if( resource.mpi == slurmmpi )
249 p_ptr->mpiImpl = "slurmmpi";
250 else if( resource.mpi == prun )
251 p_ptr->mpiImpl = "prun";
// Batch system name.
253 if( resource.Batch == pbs )
254 p_ptr->batch = "pbs";
255 else if( resource.Batch == lsf )
256 p_ptr->batch = "lsf";
257 else if( resource.Batch == sge )
258 p_ptr->batch = "sge";
259 else if( resource.Batch == ccc )
260 p_ptr->batch = "ccc";
261 else if( resource.Batch == slurm )
262 p_ptr->batch = "slurm";
// NOTE(review): ssh_batch is exported as "ssh", whereas AddResource() tests for the
// text "ssh_batch"; a definition read here would presumably be rejected when passed
// back to AddResource() - confirm which spelling is intended.
263 else if( resource.Batch == ssh_batch )
264 p_ptr->batch = "ssh";
265 else if( resource.Batch == ll )
// Converts the CORBA definition `new_resource` into a ParserResourcesType and adds
// it to the catalog held by _rm (AddResourceInCatalog); the catalog can then be
// written to `xml_file` with WriteInXmlFile.
// The textual fields batch, mpiImpl, mode, protocol and iprotocol are mapped to
// enum values. An empty string selects a default (none, nompi, interactive, rsh and
// rsh respectively). For each field, the handling of an unrecognised value logs it
// and raises through THROW_SALOME_CORBA_EXCEPTION with SALOME::BAD_PARAM.
// NOTE(review): several branch heads of the if/else chains are not among the lines
// shown here; the comments below describe only what is visible.
272 SALOME_ResourcesManager::AddResource(const Engines::ResourceDefinition& new_resource,
273 CORBA::Boolean write,
274 const char * xml_file)
// Plain string and numeric fields are copied as they are.
276 ParserResourcesType resource;
277 resource.Name = new_resource.name.in();
278 resource.HostName = new_resource.hostname.in();
279 resource.OS = new_resource.OS.in();
280 resource.AppliPath = new_resource.applipath.in();
281 resource.DataForSort._memInMB = new_resource.mem_mb;
282 resource.DataForSort._CPUFreqMHz = new_resource.cpu_clock;
283 resource.DataForSort._nbOfNodes = new_resource.nb_node;
284 resource.DataForSort._nbOfProcPerNode = new_resource.nb_proc_per_node;
285 resource.UserName = new_resource.username.in();
// Batch system: text -> enum.
// NOTE(review): the text accepted for ssh_batch is "ssh_batch", while
// GetResourceDefinition() emits "ssh" for the same enum value - confirm.
287 std::string aBatch = new_resource.batch.in();
289 resource.Batch = pbs;
290 else if (aBatch == "lsf")
291 resource.Batch = lsf;
292 else if (aBatch == "sge")
293 resource.Batch = sge;
294 else if (aBatch == "slurm")
295 resource.Batch = slurm;
296 else if (aBatch == "ccc")
297 resource.Batch = ccc;
298 else if (aBatch == "ssh_batch")
299 resource.Batch = ssh_batch;
300 else if (aBatch == "ll")
302 else if (aBatch == "")
303 resource.Batch = none;
305 INFOS("Bad Batch definition in AddResource: " << aBatch);
306 std::string message("Bad Batch definition in AddResource: ");
308 THROW_SALOME_CORBA_EXCEPTION(message.c_str(),SALOME::BAD_PARAM);
// MPI implementation: text -> enum.
311 std::string anMpi = new_resource.mpiImpl.in();
314 else if (anMpi == "mpich1")
315 resource.mpi = mpich1;
316 else if (anMpi == "mpich2")
317 resource.mpi = mpich2;
318 else if (anMpi == "openmpi")
319 resource.mpi = openmpi;
320 else if (anMpi == "slurmmpi")
321 resource.mpi = slurmmpi;
322 else if (anMpi == "prun")
324 else if (anMpi == "")
325 resource.mpi = nompi;
327 INFOS("Bad MPI definition in AddResource: " << anMpi);
328 std::string message("Bad MPI definition in AddResource: ");
330 THROW_SALOME_CORBA_EXCEPTION(message.c_str(),SALOME::BAD_PARAM);
// Access mode: text -> enum (an empty string means interactive).
333 std::string mode_str = new_resource.mode.in();
334 if (mode_str == "interactive")
335 resource.Mode = interactive;
336 else if (mode_str == "batch")
337 resource.Mode = batch;
338 else if (mode_str == "")
339 resource.Mode = interactive;
341 INFOS("Bad mode definition in AddResource: " << mode_str);
342 std::string message("Bad mode definition in AddResource: ");
344 THROW_SALOME_CORBA_EXCEPTION(message.c_str(),SALOME::BAD_PARAM);
// Access protocol: text -> enum (an empty string means rsh).
347 std::string protocol = new_resource.protocol.in();
348 if (protocol == "rsh")
349 resource.Protocol = rsh;
350 else if (protocol == "ssh")
351 resource.Protocol = ssh;
352 else if (protocol == "srun")
353 resource.Protocol = srun;
354 else if (protocol == "")
355 resource.Protocol = rsh;
357 INFOS("Bad protocol definition in AddResource: " << protocol);
358 std::string message("Bad protocol definition in AddResource: ");
360 THROW_SALOME_CORBA_EXCEPTION(message.c_str(),SALOME::BAD_PARAM);
// Cluster-internal protocol: text -> enum (an empty string means rsh).
363 std::string iprotocol = new_resource.iprotocol.in();
364 if (iprotocol == "rsh")
365 resource.ClusterInternalProtocol = rsh;
366 else if (iprotocol == "ssh")
367 resource.ClusterInternalProtocol = ssh;
368 else if (iprotocol == "srun")
369 resource.ClusterInternalProtocol = srun;
370 else if (iprotocol == "")
371 resource.ClusterInternalProtocol = rsh;
373 INFOS("Bad iprotocol definition in AddResource: " << iprotocol);
374 std::string message("Bad iprotocol definition in AddResource: ");
375 message += iprotocol;
376 THROW_SALOME_CORBA_EXCEPTION(message.c_str(),SALOME::BAD_PARAM);
// Component names are copied one by one.
379 for (CORBA::ULong i = 0; i < new_resource.componentList.length(); i++)
380 resource.ComponentsList.push_back(new_resource.componentList[i].in());
// Register the converted resource in the in-memory catalog.
382 _rm.AddResourceInCatalog(resource);
// NOTE(review): the condition guarding this write is not among the lines shown
// here - presumably it tests `write`; confirm.
386 _rm.WriteInXmlFile(std::string(xml_file));
// Removes the resource called `resource_name` from the catalog held by _rm
// (DeleteResourceInCatalog); the catalog can then be written to `xml_file` with
// WriteInXmlFile.
392 SALOME_ResourcesManager::RemoveResource(const char * resource_name,
393 CORBA::Boolean write,
394 const char * xml_file)
396 _rm.DeleteResourceInCatalog(resource_name);
// NOTE(review): the condition guarding this write is not among the lines shown
// here - presumably it tests `write`; confirm.
399 _rm.WriteInXmlFile(std::string(xml_file));
// Writes a machine file for resource `resource_name` and returns its path.
// machine_file_name starts out empty and is only assigned where a file is created,
// so the error branches shown here (which only log through INFOS) return an empty
// string.
//  - parallelLib == "Dummy": host names are written one per line until the counter
//    compared with nb_procs matches it, restarting from the first machine when the
//    end of the machine list is reached.
//  - parallelLib == "Mpi": one line per cluster member, "<host> cpu=<n>" for lam
//    and "<host> slots=<n>" for openmpi, where n is
//    _nbOfNodes * _nbOfProcPerNode of that member.
// A resource without cluster members is treated as a cluster of one machine built
// from the resource's own fields.
// NOTE(review): the file name comes from tmpnam() and the file is only created
// afterwards by std::ofstream, which leaves a window in which another process can
// create the same path; consider mkstemp() - confirm portability requirements.
405 SALOME_ResourcesManager::getMachineFile(std::string resource_name,
406 CORBA::Long nb_procs,
407 std::string parallelLib)
409 std::string machine_file_name("");
411 if (parallelLib == "Dummy")
413 MESSAGE("[getMachineFile] parallelLib is Dummy");
// Work on a local copy of the resource map; the changes made to `resource` below
// are therefore applied to copies, not to the catalog held by _rm.
414 MapOfParserResourcesType resourcesList = _rm.GetList();
415 if (resourcesList.find(resource_name) != resourcesList.end())
417 ParserResourcesType resource = resourcesList[resource_name];
419 // Check if resource is cluster or not
420 if (resource.ClusterMembersList.empty())
422 //It is not a cluster so we create a cluster with one machine
423 ParserResourcesClusterMembersType fake_node;
424 fake_node.HostName = resource.HostName;
425 fake_node.Protocol = resource.Protocol;
426 fake_node.ClusterInternalProtocol = resource.ClusterInternalProtocol;
427 fake_node.UserName = resource.UserName;
428 fake_node.AppliPath = resource.AppliPath;
429 fake_node.DataForSort = resource.DataForSort;
431 resource.ClusterMembersList.push_front(fake_node);
434 // Creating list of machines for creating the machine file
435 std::list<std::string> list_of_machines;
436 std::list<ParserResourcesClusterMembersType>::iterator cluster_it =
437 resource.ClusterMembersList.begin();
438 while (cluster_it != resource.ClusterMembersList.end())
440 // For each member of the cluster we add nbOfNodes * nbOfProcPerNode entries to the list
441 unsigned int number_of_proc = (*cluster_it).DataForSort._nbOfNodes *
442 (*cluster_it).DataForSort._nbOfProcPerNode;
443 for (unsigned int i = 0; i < number_of_proc; i++)
444 list_of_machines.push_back((*cluster_it).HostName);
448 // Creating machine file
449 machine_file_name = tmpnam(NULL);
450 std::ofstream machine_file(machine_file_name.c_str(), std::ios_base::out);
// NOTE(review): the loop below stops only when machine_number equals nb_procs, and
// the statements advancing machine_number and `it` are not among the lines shown
// here. If nb_procs can be negative, or list_of_machines can be empty (every
// member reporting 0 processors), it appears the loop would not terminate or
// would dereference end() - confirm and guard.
452 CORBA::Long machine_number = 0;
453 std::list<std::string>::iterator it = list_of_machines.begin();
454 while (machine_number != nb_procs)
456 // Adding a new node to the machine file
457 machine_file << *it << std::endl;
// Wrap around to the first machine once the end of the list is reached.
461 if (it == list_of_machines.end())
462 it = list_of_machines.begin();
467 INFOS("[getMachineFile] Error resource_name not found in resourcesList -> " << resource_name);
469 else if (parallelLib == "Mpi")
471 MESSAGE("[getMachineFile] parallelLib is Mpi");
// Same lookup and single-machine fallback as in the "Dummy" branch.
473 MapOfParserResourcesType resourcesList = _rm.GetList();
474 if (resourcesList.find(resource_name) != resourcesList.end())
476 ParserResourcesType resource = resourcesList[resource_name];
477 // Check if resource is cluster or not
478 if (resource.ClusterMembersList.empty())
480 //It is not a cluster so we create a cluster with one machine
481 ParserResourcesClusterMembersType fake_node;
482 fake_node.HostName = resource.HostName;
483 fake_node.Protocol = resource.Protocol;
484 fake_node.ClusterInternalProtocol = resource.ClusterInternalProtocol;
485 fake_node.UserName = resource.UserName;
486 fake_node.AppliPath = resource.AppliPath;
487 fake_node.DataForSort = resource.DataForSort;
489 resource.ClusterMembersList.push_front(fake_node);
492 // Choose mpi implementation -> each MPI implementation has its own machinefile...
493 if (resource.mpi == lam)
495 // Creating machine file
496 machine_file_name = tmpnam(NULL);
497 std::ofstream machine_file(machine_file_name.c_str(), std::ios_base::out);
499 // We add all cluster machines to the file
500 std::list<ParserResourcesClusterMembersType>::iterator cluster_it =
501 resource.ClusterMembersList.begin();
502 while (cluster_it != resource.ClusterMembersList.end())
// LAM line format: "<host> cpu=<number of processors>".
504 unsigned int number_of_proc = (*cluster_it).DataForSort._nbOfNodes *
505 (*cluster_it).DataForSort._nbOfProcPerNode;
506 machine_file << (*cluster_it).HostName << " cpu=" << number_of_proc << std::endl;
510 else if (resource.mpi == openmpi)
512 // Creating machine file
513 machine_file_name = tmpnam(NULL);
514 std::ofstream machine_file(machine_file_name.c_str(), std::ios_base::out);
516 // We add all cluster machines to the file
517 std::list<ParserResourcesClusterMembersType>::iterator cluster_it =
518 resource.ClusterMembersList.begin();
519 while (cluster_it != resource.ClusterMembersList.end())
// Open MPI line format: "<host> slots=<number of processors>".
521 unsigned int number_of_proc = (*cluster_it).DataForSort._nbOfNodes *
522 (*cluster_it).DataForSort._nbOfProcPerNode;
523 machine_file << (*cluster_it).HostName << " slots=" << number_of_proc << std::endl;
// NOTE(review): this branch is taken when the resource has no MPI implementation,
// yet the message reads "was defined" - presumably "no MPI implementation was
// defined" is meant; confirm before changing the log text.
527 else if (resource.mpi == nompi)
529 INFOS("[getMachineFile] Error resource_name MPI implementation was defined for " << resource_name);
// NOTE(review): "currenly" in the message below looks like a typo for "currently".
532 INFOS("[getMachineFile] Error resource_name MPI implementation not currenly handled for " << resource_name);
535 INFOS("[getMachineFile] Error resource_name not found in resourcesList -> " << resource_name);
538 INFOS("[getMachineFile] Error parallelLib is not handled -> " << parallelLib);
// Empty unless one of the branches above created a file.
540 return machine_file_name;