1 // Copyright (C) 2007-2016 CEA/DEN, EDF R&D, OPEN CASCADE
3 // Copyright (C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
4 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 2.1 of the License, or (at your option) any later version.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // Lesser General Public License for more details.
16 // You should have received a copy of the GNU Lesser General Public
17 // License along with this library; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
30 #include <libbatch/BatchManagerCatalog.hxx>
31 #include <libbatch/FactBatchManager.hxx>
32 #include <libbatch/BatchManager.hxx>
35 #include "Basics_Utils.hxx"
36 #include "Basics_DirUtils.hxx"
37 #include "SALOME_Launcher_Handler.hxx"
38 #include "Launcher.hxx"
39 #include "Launcher_Job_Command.hxx"
40 #include "Launcher_XML_Persistence.hxx"
44 //=============================================================================
48 * Define a CORBA single-thread policy for the server, which avoids having to deal
49 * with non-thread-safe usage such as Change_Directory in the SALOME naming service
51 //=============================================================================
// Constructor: emits a trace message, then allocates the pthread mutex
// referenced by _job_cpt_mutex and initialises it with default attributes.
52 Launcher_cpp::Launcher_cpp()
54 LAUNCHER_MESSAGE("Launcher_cpp constructor");
// The mutex is heap-allocated here; the destructor destroys and deletes it.
56 _job_cpt_mutex = new pthread_mutex_t();
57 pthread_mutex_init(_job_cpt_mutex, NULL);
60 //=============================================================================
64 //=============================================================================
// Destructor: deletes every Launcher::Job stored in _launcher_job_map,
// iterates over _batchmap, then destroys and frees the job-counter mutex.
65 Launcher_cpp::~Launcher_cpp()
67 LAUNCHER_MESSAGE("Launcher_cpp destructor");
// The map holds raw pointers; each job object is deleted here.
69 std::map<int, Launcher::Job *>::const_iterator it_job;
70 for(it_job = _launcher_job_map.begin(); it_job != _launcher_job_map.end(); it_job++)
71 delete it_job->second;
// NOTE(review): the body of this second loop is not visible in this excerpt;
// presumably it deletes each Batch::BatchManager - confirm.
72 std::map <int, Batch::BatchManager * >::const_iterator it1;
73 for(it1=_batchmap.begin();it1!=_batchmap.end();it1++)
// Tear down the mutex that the constructor allocated and initialised.
77 pthread_mutex_destroy(_job_cpt_mutex);
78 delete _job_cpt_mutex;
83 //=============================================================================
85 * Add a job into the launcher - check resource and choose one
87 //=============================================================================
// Registers new_job in the launcher.
// The job is given the current value of _job_cpt as its number (read while
// _job_cpt_mutex is held) and is stored in _launcher_job_map under that number.
// Throws LauncherException when the map already contains an entry with the
// same number.
// NOTE(review): the statement that advances _job_cpt is not visible in this
// excerpt - confirm it sits inside the locked region.
89 Launcher_cpp::createJob(Launcher::Job * new_job)
91 LAUNCHER_MESSAGE("Creating a new job");
92 // Add job to the jobs map
93 pthread_mutex_lock(_job_cpt_mutex);
94 new_job->setNumber(_job_cpt);
96 pthread_mutex_unlock(_job_cpt_mutex);
// The map lookup and insertion below happen after the mutex is released.
97 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(new_job->getNumber());
98 if (it_job == _launcher_job_map.end())
99 _launcher_job_map[new_job->getNumber()] = new_job;
// Otherwise an entry with this number already exists: report it and refuse.
102 LAUNCHER_INFOS("A job as already the same id: " << new_job->getNumber());
104 throw LauncherException("A job as already the same id - job is not created !");
106 LAUNCHER_MESSAGE("New Job created");
109 //=============================================================================
113 //=============================================================================
// Submits the job identified by job_id to its batch manager.
// Steps visible here:
//   1. look the job up in _launcher_job_map (LauncherException if absent);
//   2. require its state to be "CREATED" (LauncherException otherwise);
//   3. call createBatchManagerForJob() when _batchmap has no entry for job_id;
//   4. submit the batch job, then record the returned Batch::JobId, set the
//      state to "QUEUED" and store the batch reference on the job.
// A Batch::GenericException raised by the submission is logged and converted
// into a LauncherException carrying the same message.
115 Launcher_cpp::launchJob(int job_id)
117 LAUNCHER_MESSAGE("Launch a job");
119 // Check that the job exists
120 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(job_id);
121 if (it_job == _launcher_job_map.end())
123 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
124 throw LauncherException("Cannot find the job, is it created ?");
127 Launcher::Job * job = it_job->second;
129 // Check job state (cannot launch a job already launched...)
130 if (job->getState() != "CREATED")
132 LAUNCHER_INFOS("Bad state of the job: " << job->getState());
133 throw LauncherException("Bad state of the job: " + job->getState());
136 // Third step: search the map for the job's batch manager -> instantiate one if it does not exist
138 std::map<int, Batch::BatchManager *>::const_iterator it = _batchmap.find(job_id);
139 if(it == _batchmap.end())
141 createBatchManagerForJob(job);
// Batch managers are keyed by job id, so each job has its own map entry.
146 Batch::JobId batch_manager_job_id = _batchmap[job_id]->submitJob(*(job->getBatchJob()));
147 job->setBatchManagerJobId(batch_manager_job_id);
148 job->setState("QUEUED");
149 job->setReference(batch_manager_job_id.getReference());
151 catch(const Batch::GenericException &ex)
153 LAUNCHER_INFOS("Job is not launched, exception in submitJob: " << ex.message);
154 throw LauncherException(ex.message.c_str());
156 LAUNCHER_MESSAGE("Job launched");
159 //=============================================================================
163 //=============================================================================
// Returns the state of job job_id as reported by Launcher::Job::updateJobState().
// Throws LauncherException when the job is unknown, or when updateJobState()
// raises a Batch::GenericException (same message is forwarded).
// NOTE(review): the declaration of `state` and the return type are not visible
// in this excerpt. If `state` is a local std::string and the function returns
// const char*, the pointer obtained from c_str() dangles once the function
// returns - confirm and consider returning std::string.
165 Launcher_cpp::getJobState(int job_id)
167 LAUNCHER_MESSAGE("Get job state");
169 // Check that the job exists
170 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(job_id);
171 if (it_job == _launcher_job_map.end())
173 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
174 throw LauncherException("Cannot find the job, is it created ?");
177 Launcher::Job * job = it_job->second;
182 state = job->updateJobState();
184 catch(const Batch::GenericException &ex)
186 LAUNCHER_INFOS("getJobState failed, exception: " << ex.message);
187 throw LauncherException(ex.message.c_str());
190 return state.c_str();
193 //=============================================================================
195 * Get job assigned hostnames
197 //=============================================================================
// Returns the host names assigned to job job_id, as provided by
// Launcher::Job::getAssignedHostnames().
// Throws LauncherException when no job with that id is registered.
199 Launcher_cpp::getAssignedHostnames(int job_id)
201 LAUNCHER_MESSAGE("Get job assigned hostnames");
203 // Check that the job exists
204 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(job_id);
205 if (it_job == _launcher_job_map.end())
207 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
208 throw LauncherException("Cannot find the job, is it created ?");
211 Launcher::Job * job = it_job->second;
212 std::string assigned_hostnames = job->getAssignedHostnames();
// NOTE(review): assigned_hostnames is a local std::string. If this function
// returns const char* (return type not visible in this excerpt), the pointer
// from c_str() is invalid as soon as the function returns - confirm.
214 return assigned_hostnames.c_str();
217 //=============================================================================
219 * Get Job result - the result directory could be changed
221 //=============================================================================
// Imports the output files of job job_id through its batch manager.
// Two import calls are visible: one targeting the caller-supplied `directory`
// and one targeting job->getResultDirectory().
// NOTE(review): the condition choosing between them is not visible in this
// excerpt; presumably `directory` is used when it is non-empty - confirm.
// Throws LauncherException when the job is unknown or when the batch manager
// raises a Batch::GenericException.
223 Launcher_cpp::getJobResults(int job_id, std::string directory)
225 LAUNCHER_MESSAGE("Get Job results");
227 // Check that the job exists
228 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(job_id);
229 if (it_job == _launcher_job_map.end())
231 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
232 throw LauncherException("Cannot find the job, is it created ?");
235 Launcher::Job * job = it_job->second;
// resource_name is not used by any line visible in this excerpt.
236 std::string resource_name = job->getResourceDefinition().Name;
// NOTE(review): _batchmap[job_id] uses std::map::operator[], which inserts a
// null pointer when no batch manager exists for this job (e.g. a job that was
// never launched); no prior find() is visible here - confirm.
240 _batchmap[job_id]->importOutputFiles(*(job->getBatchJob()), directory);
242 _batchmap[job_id]->importOutputFiles(*(job->getBatchJob()), job->getResultDirectory());
244 catch(const Batch::GenericException &ex)
246 LAUNCHER_INFOS("getJobResult is maybe incomplete, exception: " << ex.message);
247 throw LauncherException(ex.message.c_str());
249 LAUNCHER_MESSAGE("getJobResult ended");
252 //=============================================================================
254 * Clear the remote working directory
256 //=============================================================================
// Clears the remote working directory of job job_id.
// The job is looked up in _launcher_job_map (LauncherException if absent), then
// the batch manager stored under _batchmap[job_id] is asked to clear the
// working directory of the job's batch job. A Batch::GenericException is
// logged and rethrown as a LauncherException carrying the same message.
258 Launcher_cpp::clearJobWorkingDir(int job_id)
260 LAUNCHER_MESSAGE("Clear the remote working directory");
262 // Check that the job exists
263 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(job_id);
264 if (it_job == _launcher_job_map.end())
266 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
267 throw LauncherException("Cannot find the job, is it created ?");
270 Launcher::Job * job = it_job->second;
// NOTE(review): _batchmap[job_id] uses std::map::operator[], which inserts a
// null pointer when no batch manager exists for this job; no prior find() is
// visible here - confirm.
273 _batchmap[job_id]->clearWorkingDir(*(job->getBatchJob()));
275 catch(const Batch::GenericException &ex)
// The log text names this method so that failures are attributed correctly.
277 LAUNCHER_INFOS("clearJobWorkingDir failed, exception: " << ex.message);
278 throw LauncherException(ex.message.c_str());
280 LAUNCHER_MESSAGE("clearJobWorkingDir ended");
283 //=============================================================================
285 * Get Job dump state - the result directory could be changed
287 //=============================================================================
// Imports the dump-state file of job job_id through its batch manager and
// stores the manager's return value in `rtn`.
// Two import calls are visible: one targeting the caller-supplied `directory`
// and one targeting job->getResultDirectory().
// NOTE(review): the condition choosing between them, the declaration of `rtn`
// and the final return statement are not visible in this excerpt - confirm.
// Throws LauncherException when the job is unknown or when the batch manager
// raises a Batch::GenericException.
289 Launcher_cpp::getJobDumpState(int job_id, std::string directory)
292 LAUNCHER_MESSAGE("Get Job dump state");
294 // Check that the job exists
295 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(job_id);
296 if (it_job == _launcher_job_map.end())
298 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
299 throw LauncherException("Cannot find the job, is it created ?");
302 Launcher::Job * job = it_job->second;
// resource_name is not used by any line visible in this excerpt.
303 std::string resource_name = job->getResourceDefinition().Name;
307 rtn = _batchmap[job_id]->importDumpStateFile(*(job->getBatchJob()), directory);
309 rtn = _batchmap[job_id]->importDumpStateFile(*(job->getBatchJob()), job->getResultDirectory());
311 catch(const Batch::GenericException &ex)
// Log texts name this method, matching the convention used by getJobWorkFile.
313 LAUNCHER_INFOS("getJobDumpState is maybe incomplete, exception: " << ex.message);
314 throw LauncherException(ex.message.c_str());
316 LAUNCHER_MESSAGE("getJobDumpState ended");
320 //=============================================================================
322 * Get one file from the working directory - the result directory can be changed
324 //=============================================================================
// Imports a single file, work_file, from the working directory of job job_id
// through its batch manager and stores the manager's return value in `rtn`.
// Two import calls are visible: one targeting the caller-supplied `directory`
// and one targeting job->getResultDirectory().
// NOTE(review): the condition choosing between them, the declaration of `rtn`
// and the final return statement are not visible in this excerpt - confirm.
// Throws LauncherException when the job is unknown or when the batch manager
// raises a Batch::GenericException.
326 Launcher_cpp::getJobWorkFile(int job_id,
327 std::string work_file,
328 std::string directory)
331 LAUNCHER_MESSAGE("Get working file " << work_file);
333 // Check that the job exists
334 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(job_id);
335 if (it_job == _launcher_job_map.end())
337 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
338 throw LauncherException("Cannot find the job, is it created ?");
341 Launcher::Job * job = it_job->second;
// resource_name is not used by any line visible in this excerpt.
342 std::string resource_name = job->getResourceDefinition().Name;
346 rtn = _batchmap[job_id]->importWorkFile(*(job->getBatchJob()), work_file, directory);
348 rtn = _batchmap[job_id]->importWorkFile(*(job->getBatchJob()), work_file, job->getResultDirectory());
350 catch(const Batch::GenericException &ex)
352 LAUNCHER_INFOS("getJobWorkFile is maybe incomplete, exception: " << ex.message);
353 throw LauncherException(ex.message.c_str());
355 LAUNCHER_MESSAGE("getJobWorkFile ended");
359 //=============================================================================
361 * Remove the job - into the Launcher and its batch manager
363 //=============================================================================
// Removes job job_id from the launcher.
// Calls Launcher::Job::removeJob() on the job, deletes the job object and
// erases its entry from _launcher_job_map.
// Throws LauncherException when no job with that id is registered.
// The lines visible here do not touch the _batchmap entry for job_id.
365 Launcher_cpp::removeJob(int job_id)
367 LAUNCHER_MESSAGE("Remove Job");
369 // Check that the job exists
370 std::map<int, Launcher::Job *>::iterator it_job = _launcher_job_map.find(job_id);
371 if (it_job == _launcher_job_map.end())
373 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
374 throw LauncherException("Cannot find the job, is it created ?");
// Order matters: use the job, free it, then drop the now-stale map entry.
377 it_job->second->removeJob();
378 delete it_job->second;
379 _launcher_job_map.erase(it_job);
382 //=============================================================================
386 //=============================================================================
// Stops job job_id by delegating to Launcher::Job::stopJob().
// The job stays registered in _launcher_job_map.
// Throws LauncherException when no job with that id is registered.
388 Launcher_cpp::stopJob(int job_id)
390 LAUNCHER_MESSAGE("Stop Job");
392 // Check that the job exists
393 std::map<int, Launcher::Job *>::iterator it_job = _launcher_job_map.find(job_id);
394 if (it_job == _launcher_job_map.end())
396 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
397 throw LauncherException("Cannot find the job, is it created ?");
400 it_job->second->stopJob();
403 //=============================================================================
405 * create a launcher job based on a file
406 * \param xmlExecuteFile : to define the execution on the batch cluster
408 //=============================================================================
// Builds a Launcher::Job_Command from an XML execution description.
// Visible steps: parse xmlExecuteFile with ParseXmlFile(), write the parsed
// command into a temporary shell script, configure the job (script, local and
// work directories, environment file, input/output files), fill the resource
// parameters `p` (hostname = clusterName, process count) and return the job
// number.
// NOTE(review): the declarations of `os` and `p`, the closing of the script
// file and the call that registers the job are not visible in this excerpt -
// confirm against the full source.
410 Launcher_cpp::createJobWithFile(const std::string xmlExecuteFile,
411 const std::string clusterName)
413 LAUNCHER_MESSAGE("Begin of Launcher_cpp::createJobWithFile");
416 ParserLauncherType job_params = ParseXmlFile(xmlExecuteFile);
418 // Creating a new job
419 Launcher::Job_Command * new_job = new Launcher::Job_Command();
// The command is wrapped in a /bin/sh script stored in a temporary file.
421 std::string cmdFile = Kernel_Utils::GetTmpFileName();
428 os.open(cmdFile.c_str(), std::ofstream::out );
429 os << "#! /bin/sh" << std::endl;
430 os << job_params.Command;
433 new_job->setJobFile(cmdFile);
434 new_job->setLocalDirectory(job_params.RefDirectory);
// NOTE(review): MachinesList is indexed with operator[]; if it is a std::map,
// an unknown clusterName silently yields a default-constructed entry - confirm.
435 new_job->setWorkDirectory(job_params.MachinesList[clusterName].WorkDirectory);
436 new_job->setEnvFile(job_params.MachinesList[clusterName].EnvFile);
// NOTE(review): `int i` is compared with size(); presumably size() is
// unsigned, which makes this a signed/unsigned comparison - confirm.
438 for(int i=0; i < job_params.InputFile.size(); i++)
439 new_job->add_in_file(job_params.InputFile[i]);
440 for(int i=0; i < job_params.OutputFile.size();i++)
441 new_job->add_out_file(job_params.OutputFile[i]);
444 p.hostname = clusterName;
447 p.nb_proc = job_params.NbOfProcesses;
449 p.nb_proc_per_node = 0;
452 new_job->setResourceRequiredParams(p);
455 return new_job->getNumber();
458 //=============================================================================
460 * Factory to instantiate the appropriate batch manager for the chosen cluster.
462 //=============================================================================
// Creates a Batch::BatchManager for the resource described by params.
// Visible steps: translate params.Protocol into a
// Batch::CommunicationProtocolType, select a batch-manager type name (bmType)
// from params.Batch, look that name up in the Batch::BatchManagerCatalog
// singleton, and invoke the resulting factory with the host name, user name,
// protocol and the `mpi` string.
// Throws LauncherException for an unknown protocol, an unsupported batch type
// or a missing factory.
// NOTE(review): the case labels of both switches and the declarations of
// `bmType` and `mpi` are not visible in this excerpt.
463 Batch::BatchManager *
464 Launcher_cpp::FactoryBatchManager(ParserResourcesType& params)
467 Batch::CommunicationProtocolType protocol;
468 Batch::FactBatchManager * fact;
470 std::string hostname = params.HostName;
472 switch(params.Protocol)
475 protocol = Batch::SH;
478 protocol = Batch::RSH;
481 protocol = Batch::SSH;
484 protocol = Batch::RSYNC;
487 throw LauncherException("Unknown protocol for this resource");
519 switch( params.Batch )
552 LAUNCHER_MESSAGE("Bad batch description of the resource: Batch = " << params.Batch);
553 throw LauncherException("No batchmanager for that cluster - Bad batch description of the resource");
// The catalog lookup result is down-cast to the factory type; the visible
// lines then report a missing factory and throw.
555 Batch::BatchManagerCatalog & cata = Batch::BatchManagerCatalog::getInstance();
556 fact = dynamic_cast<Batch::FactBatchManager*>(cata(bmType));
558 LAUNCHER_MESSAGE("Cannot find batch manager factory for " << bmType << ". Check your version of libBatch.");
559 throw LauncherException("Cannot find batch manager factory");
561 LAUNCHER_MESSAGE("Instantiation of batch manager of type: " << bmType);
562 Batch::BatchManager * batch_client = (*fact)(hostname.c_str(), params.UserName.c_str(),
563 protocol, mpi.c_str());
567 //----------------------------------------------------------
568 // Without LIBBATCH - Launcher_cpp does nothing...
569 //----------------------------------------------------------
// Variant used when the launcher is built without libBatch: emits an info
// message and always throws LauncherException.
573 Launcher_cpp::createJob(Launcher::Job * new_job)
575 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot create a job !!!");
577 throw LauncherException("Method Launcher_cpp::createJob is not available "
578 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: emits an info
// message and always throws LauncherException.
582 Launcher_cpp::launchJob(int job_id)
584 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot launch a job !!!");
585 throw LauncherException("Method Launcher_cpp::launchJob is not available "
586 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: emits an info
// message and always throws LauncherException.
590 Launcher_cpp::getJobState(int job_id)
592 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job state!!!");
593 throw LauncherException("Method Launcher_cpp::getJobState is not available "
594 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: emits an info
// message and always throws LauncherException.
598 Launcher_cpp::getAssignedHostnames(int job_id)
600 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job assigned hostnames!!!");
601 throw LauncherException("Method Launcher_cpp::getAssignedHostnames is not available "
602 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: emits an info
// message and always throws LauncherException.
606 Launcher_cpp::getJobResults(int job_id, std::string directory)
608 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job results!!!");
609 throw LauncherException("Method Launcher_cpp::getJobResults is not available "
610 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: emits an info
// message and always throws LauncherException.
614 Launcher_cpp::clearJobWorkingDir(int job_id)
616 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot clear directory!!!");
617 throw LauncherException("Method Launcher_cpp::clearJobWorkingDir is not available "
618 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: emits an info
// message and always throws LauncherException.
622 Launcher_cpp::getJobDumpState(int job_id, std::string directory)
624 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job dump state!!!");
625 throw LauncherException("Method Launcher_cpp::getJobDumpState is not available "
626 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: emits an info
// message and always throws LauncherException.
// The info message names the work-file operation so that it can be told apart
// from the getJobDumpState variant in the logs.
630 Launcher_cpp::getJobWorkFile(int job_id, std::string work_file, std::string directory)
632 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job work file!!!");
633 throw LauncherException("Method Launcher_cpp::getJobWorkFile is not available "
634 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: emits an info
// message and always throws LauncherException.
638 Launcher_cpp::removeJob(int job_id)
640 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot remove job!!!");
641 throw LauncherException("Method Launcher_cpp::removeJob is not available "
642 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: always throws
// LauncherException. No info message is emitted in the lines visible here,
// unlike most sibling variants.
646 Launcher_cpp::stopJob(int job_id)
648 throw LauncherException("Method Launcher_cpp::stopJob is not available "
649 "(libBatch was not present at compilation time)");
// Variant used when the launcher is built without libBatch: always throws
// LauncherException. No info message is emitted in the lines visible here,
// unlike most sibling variants.
653 Launcher_cpp::createJobWithFile( const std::string xmlExecuteFile, std::string clusterName)
655 throw LauncherException("Method Launcher_cpp::createJobWithFile is not available "
656 "(libBatch was not present at compilation time)");
// Parses the XML file xmlExecuteFile into a ParserLauncherType.
// The file is first opened with fopen() as a readability check, then parsed
// with libxml2's xmlReadFile(); the resulting document is handed to a
// SALOME_Launcher_Handler bound to job_params.
// Throws LauncherException when the file cannot be parsed or is not readable.
// NOTE(review): the release of `handler`, `aFile` and `aDoc`, the branch
// conditions and the final return are not visible in this excerpt - confirm
// that every path (including both throws) frees them.
// NOTE(review): both messages are prefixed "ResourcesManager_cpp:" although
// this is a Launcher_cpp method - possibly copied from elsewhere.
663 Launcher_cpp::ParseXmlFile(std::string xmlExecuteFile)
665 ParserLauncherType job_params;
666 SALOME_Launcher_Handler * handler = new SALOME_Launcher_Handler(job_params);
668 const char* aFilePath = xmlExecuteFile.c_str();
669 FILE* aFile = fopen(aFilePath, "r");
672 xmlDocPtr aDoc = xmlReadFile(aFilePath, NULL, 0);
674 handler->ProcessXmlDocument(aDoc);
677 std::string message = "ResourcesManager_cpp: could not parse file: " + xmlExecuteFile;
678 LAUNCHER_MESSAGE(message);
680 throw LauncherException(message);
688 std::string message = "ResourcesManager_cpp: file is not readable: " + xmlExecuteFile;
689 LAUNCHER_MESSAGE(message);
691 throw LauncherException(message);
// Returns the job map by value, i.e. a copy of _launcher_job_map.
// The copied map still holds the same Launcher::Job pointers as the original.
699 std::map<int, Launcher::Job *>
700 Launcher_cpp::getJobs()
702 return _launcher_job_map;
// Selects a resource for `job` and makes sure a batch manager exists for it.
// Visible steps:
//   1. ask _ResManager for the resources fitting the job's required
//      parameters, restricted to those able to launch batch jobs;
//   2. fail with LauncherException (job state set to "ERROR") when none fits;
//   3. take the first fitting resource and attach its definition to the job;
//   4. when _batchmap has no entry for the job number, create a batch manager
//      with FactoryBatchManager() and store it under that number.
// ResourcesException and Batch::GenericException are converted into
// LauncherException.
// NOTE(review): what follows the LAUNCHER_INFOS calls in the two
// LauncherException handlers (rethrow or not) is not visible in this excerpt.
706 Launcher_cpp::createBatchManagerForJob(Launcher::Job * job)
708 int job_id = job->getNumber();
710 // Select a resource for the job
711 std::vector<std::string> ResourceList;
712 resourceParams params = job->getResourceRequiredParams();
713 // Consider only resources that can launch batch jobs
714 params.can_launch_batch_jobs = true;
717 ResourceList = _ResManager->GetFittingResources(params);
719 catch(const ResourcesException &ex)
721 throw LauncherException(ex.msg.c_str());
723 if (ResourceList.size() == 0)
725 LAUNCHER_INFOS("No adequate resource found for the job, number " << job->getNumber());
726 job->setState("ERROR");
727 throw LauncherException("No resource found the job");
730 // Configure the job with the selected resource - the first of the list
731 ParserResourcesType resource_definition = _ResManager->GetResourcesDescr(ResourceList[0]);
733 // Set resource definition to the job
734 // The job will check if the definitions needed
737 job->setResourceDefinition(resource_definition);
739 catch(const LauncherException &ex)
741 LAUNCHER_INFOS("Error in the definition of the resource, mess: " << ex.msg);
742 job->setState("ERROR");
746 // Step 2: We can now add a Factory if the resource is correctly defined
748 std::map<int, Batch::BatchManager *>::const_iterator it = _batchmap.find(job_id);
749 if(it == _batchmap.end())
753 // Warning: this cannot be written on one line as below, because the map entry is
754 // constructed before the method is called...
755 //_batchmap[job_id] = FactoryBatchManager(resource_definition);
756 Batch::BatchManager * batch_client = FactoryBatchManager(resource_definition);
757 _batchmap[job_id] = batch_client;
759 catch(const LauncherException &ex)
761 LAUNCHER_INFOS("Error during creation of the batch manager of the job, mess: " << ex.msg);
764 catch(const Batch::GenericException &ex)
766 LAUNCHER_INFOS("Error during creation of the batch manager of the job, mess: " << ex.message);
767 throw LauncherException(ex.message);
// Registers an already-submitted job without submitting it again.
// The job receives the current _job_cpt value as its number (under
// _job_cpt_mutex), a batch manager is created for it, the job is attached to
// that manager with addJob() using the job's existing reference, and finally
// it is stored in _launcher_job_map.
// Throws LauncherException when addJob() raises a Batch::GenericException or
// when the map already contains the job number.
// NOTE(review): the statement that advances _job_cpt is not visible in this
// excerpt - confirm it sits inside the locked region.
774 Launcher_cpp::addJobDirectlyToMap(Launcher::Job * new_job)
776 // Step 0: Compute the job_id
777 pthread_mutex_lock(_job_cpt_mutex);
778 int job_id = _job_cpt;
780 new_job->setNumber(job_id);
781 pthread_mutex_unlock(_job_cpt_mutex);
783 // Step 1: check if resource is already in the map
784 createBatchManagerForJob(new_job);
786 // Step 2: add the job to the batch manager
790 Batch::JobId batch_manager_job_id = _batchmap[job_id]->addJob(*(new_job->getBatchJob()),
791 new_job->getReference());
792 new_job->setBatchManagerJobId(batch_manager_job_id);
794 catch(const Batch::GenericException &ex)
796 LAUNCHER_INFOS("Job cannot be added, exception in addJob: " << ex.message);
797 throw LauncherException(ex.message.c_str());
800 // Step 3: add job to launcher map
801 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(new_job->getNumber());
802 if (it_job == _launcher_job_map.end())
803 _launcher_job_map[new_job->getNumber()] = new_job;
// Otherwise an entry with this number already exists: report it and refuse.
806 LAUNCHER_INFOS("A job as already the same id: " << new_job->getNumber());
808 throw LauncherException("A job as already the same id - job is not created !");
810 LAUNCHER_MESSAGE("New job added");
// Loads jobs from the XML file jobs_file and registers them in the launcher.
// Returns the list of numbers given to the jobs that were registered.
// Handling depends on the persisted state of each job:
//   - "CREATED": run information is ignored (the registration call is not
//     visible in this excerpt);
//   - "QUEUED" / "RUNNING" / "IN_PROCESS" / "PAUSED": the job is attached with
//     addJobDirectlyToMap(); it is set to "ERROR" when the batch manager's
//     reference differs from the job's own reference;
//   - "FINISHED" / "FAILED" / "ERROR": the job is attached with
//     addJobDirectlyToMap();
//   - any other state is reported as a bad job.
// A LauncherException raised while loading a job is logged.
// NOTE(review): whether the failed job object is deleted in that handler is
// not visible in this excerpt.
815 Launcher_cpp::loadJobs(const char* jobs_file)
817 list<int> new_jobs_id_list;
819 // Load the jobs from XML file
820 list<Launcher::Job *> jobs_list = Launcher::XML_Persistence::loadJobs(jobs_file);
822 // Create each job in the launcher
823 list<Launcher::Job *>::const_iterator it_job;
824 for (it_job = jobs_list.begin(); it_job != jobs_list.end(); it_job++)
826 Launcher::Job * new_job = *it_job;
827 string job_state = new_job->getState();
831 if (job_state == "CREATED")
833 // In this case, we ignore run_part information
835 new_jobs_id_list.push_back(new_job->getNumber());
837 else if (job_state == "QUEUED" ||
838 job_state == "RUNNING" ||
839 job_state == "IN_PROCESS" ||
840 job_state == "PAUSED")
842 addJobDirectlyToMap(new_job);
843 new_jobs_id_list.push_back(new_job->getNumber());
845 // Step 4: We check that the BatchManager could resume
848 if (new_job->getBatchManagerJobId().getReference() != new_job->getReference())
850 LAUNCHER_INFOS("BatchManager type cannot resume a job - job state is set to ERROR");
851 new_job->setState("ERROR");
855 else if (job_state == "FINISHED" ||
856 job_state == "FAILED" ||
857 job_state == "ERROR")
859 // Step 2: We add run_part information
860 addJobDirectlyToMap(new_job);
861 new_jobs_id_list.push_back(new_job->getNumber());
865 LAUNCHER_INFOS("A bad job is found, state unknown " << job_state);
869 catch(const LauncherException &ex)
871 LAUNCHER_INFOS("Cannot load the job. Exception: " << ex.msg.c_str());
876 return new_jobs_id_list;
// Saves the registered jobs to the XML file jobs_file.
// Job numbers 0 .. _job_cpt-1 are probed in increasing order, so the list
// passed to Launcher::XML_Persistence::saveJobs() is ordered by job number;
// numbers with no entry in _launcher_job_map are skipped.
880 Launcher_cpp::saveJobs(const char* jobs_file)
882 // Create a sorted list from the internal job map
883 list<const Launcher::Job *> jobs_list;
884 for (int i=0; i<_job_cpt; i++)
886 map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(i);
887 if (it_job != _launcher_job_map.end())
888 jobs_list.push_back(it_job->second);
891 // Save the jobs in XML file
892 Launcher::XML_Persistence::saveJobs(jobs_file, jobs_list);