1 // Copyright (C) 2007-2019 CEA/DEN, EDF R&D, OPEN CASCADE
3 // Copyright (C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
4 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 2.1 of the License, or (at your option) any later version.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // Lesser General Public License for more details.
16 // You should have received a copy of the GNU Lesser General Public
17 // License along with this library; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
31 #include <libbatch/BatchManagerCatalog.hxx>
32 #include <libbatch/FactBatchManager.hxx>
33 #include <libbatch/BatchManager.hxx>
36 #include "Basics_Utils.hxx"
37 #include "Basics_DirUtils.hxx"
38 #include "SALOME_Launcher_Handler.hxx"
39 #include "Launcher.hxx"
40 #include "Launcher_Job_Command.hxx"
41 #include "Launcher_XML_Persistence.hxx"
45 //=============================================================================
49 * Define a CORBA single-thread policy for the server, which avoids dealing
50 * with non-thread-safe usage such as Change_Directory in the SALOME naming service
52 //=============================================================================
// Constructor: emits a trace message through LAUNCHER_MESSAGE.
// NOTE(review): any member initialisation is not visible in this listing - confirm against the full file.
53 Launcher_cpp::Launcher_cpp()
55 LAUNCHER_MESSAGE("Launcher_cpp constructor");
59 //=============================================================================
63 //=============================================================================
// Destructor: releases one reference on every job stored in _launcher_job_map,
// then iterates over the batch managers stored in _batchmap.
64 Launcher_cpp::~Launcher_cpp()
66 LAUNCHER_MESSAGE("Launcher_cpp destructor");
68 std::map<int, Launcher::Job *>::const_iterator it_job;
// Drop the launcher's reference on each registered job.
69 for(it_job = _launcher_job_map.begin(); it_job != _launcher_job_map.end(); it_job++)
70 it_job->second->decrRef();
71 std::map <int, Batch::BatchManager * >::const_iterator it1;
// NOTE(review): the statement executed by this loop is not visible in this listing;
// presumably it destroys each batch manager - confirm.
72 for(it1=_batchmap.begin();it1!=_batchmap.end();it1++)
79 //=============================================================================
81 * Add a job into the launcher - check resource and choose one
83 //=============================================================================
// Registers new_job in the launcher's job map.
// The job receives the current value of _job_cpt as its number. If no job with that
// number is registered yet, the pointer is stored in _launcher_job_map; otherwise an
// info message is logged and a LauncherException is thrown.
// NOTE(review): the update of _job_cpt and any reference counting done on insertion are
// not visible in this listing - confirm.
85 Launcher_cpp::createJob(Launcher::Job * new_job)
87 LAUNCHER_MESSAGE("Creating a new job");
88 // Add job to the jobs map
89 new_job->setNumber(_job_cpt);
// Look the number up first so an existing entry is never overwritten.
91 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(new_job->getNumber());
92 if (it_job == _launcher_job_map.end())
94 _launcher_job_map[new_job->getNumber()] = new_job;
99 LAUNCHER_INFOS("A job has already the same id: " << new_job->getNumber());
100 throw LauncherException("A job has already the same id - job is not created !");
102 LAUNCHER_MESSAGE("New Job created");
105 //=============================================================================
109 //=============================================================================
// Submits the job identified by job_id to its batch manager.
// The job is looked up with findJob. Its state must be "CREATED", otherwise a
// LauncherException is thrown. On successful submission the job records the batch
// manager job id and reference and its state becomes "QUEUED".
// A Batch::GenericException is logged and rethrown as a LauncherException carrying
// the same message.
111 Launcher_cpp::launchJob(int job_id)
113 LAUNCHER_MESSAGE("Launch a job");
115 // Check if job exists
116 Launcher::Job * job = findJob(job_id);
118 // Check job state (cannot launch a job already launched...)
119 if (job->getState() != "CREATED")
121 LAUNCHER_INFOS("Bad state of the job: " << job->getState());
122 throw LauncherException("Bad state of the job: " + job->getState());
// Obtain the batch manager associated with this job.
125 Batch::BatchManager * bm = getBatchManager(job);
128 Batch::JobId batch_manager_job_id = bm->submitJob(*(job->getBatchJob()));
129 job->setBatchManagerJobId(batch_manager_job_id);
130 job->setState("QUEUED");
131 job->setReference(batch_manager_job_id.getReference());
133 catch(const Batch::GenericException &ex)
135 LAUNCHER_INFOS("Job is not launched, exception in submitJob: " << ex.message);
136 throw LauncherException(ex.message.c_str());
138 LAUNCHER_MESSAGE("Job launched");
141 //=============================================================================
145 //=============================================================================
// Returns the state of the job identified by job_id, as reported by
// Launcher::Job::updateJobState().
// The job is looked up with findJob. A Batch::GenericException raised while updating
// the state is logged and rethrown as a LauncherException with the same message.
// NOTE(review): the return type and the declaration of `state` are not visible in this
// listing. If the function returns `const char *` and `state` is a local std::string,
// the pointer from c_str() would dangle after return - confirm.
147 Launcher_cpp::getJobState(int job_id)
149 LAUNCHER_MESSAGE("Get job state");
151 // Check if job exist
152 Launcher::Job * job = findJob(job_id);
157 state = job->updateJobState();
159 catch(const Batch::GenericException &ex)
161 LAUNCHER_INFOS("getJobState failed, exception: " << ex.message);
162 throw LauncherException(ex.message.c_str());
165 return state.c_str();
168 //=============================================================================
170 * Get job assigned hostnames
172 //=============================================================================
// Returns the hostnames assigned to the job identified by job_id, as reported by
// Launcher::Job::getAssignedHostnames(). The job is looked up with findJob.
// NOTE(review): the return type is not visible in this listing. `assigned_hostnames`
// is a local std::string, so if the function returns `const char *` the pointer from
// c_str() would dangle after return - confirm.
174 Launcher_cpp::getAssignedHostnames(int job_id)
176 LAUNCHER_MESSAGE("Get job assigned hostnames");
178 // Check if job exist
179 Launcher::Job * job = findJob(job_id);
180 std::string assigned_hostnames = job->getAssignedHostnames();
182 return assigned_hostnames.c_str();
185 //=============================================================================
187 * Get Job result - the result directory could be changed
189 //=============================================================================
// Imports the output files of the job identified by job_id through its batch manager.
// Two import calls are visible: one targets the `directory` argument, the other the
// job's own result directory (job->getResultDirectory()).
// NOTE(review): the condition selecting between the two calls is not visible in this
// listing - presumably it tests whether `directory` is empty; confirm.
// A Batch::GenericException is logged and rethrown as a LauncherException.
191 Launcher_cpp::getJobResults(int job_id, std::string directory)
193 LAUNCHER_MESSAGE("Get Job results");
195 Launcher::Job * job = findJob(job_id);
// NOTE(review): resource_name is not used by any visible line of this function.
196 std::string resource_name = job->getResourceDefinition().Name;
// NOTE(review): _batchmap[job_id] uses std::map::operator[], which inserts a null
// entry when the key is absent; the call would then dereference a null pointer.
// Confirm a batch manager is always registered for job_id before this point.
200 _batchmap[job_id]->importOutputFiles(*(job->getBatchJob()), directory);
202 _batchmap[job_id]->importOutputFiles(*(job->getBatchJob()), job->getResultDirectory());
204 catch(const Batch::GenericException &ex)
206 LAUNCHER_INFOS("getJobResult is maybe incomplete, exception: " << ex.message);
207 throw LauncherException(ex.message.c_str());
209 LAUNCHER_MESSAGE("getJobResult ended");
212 //=============================================================================
214 * Clear the remote working directory
216 //=============================================================================
// Clears the remote working directory of the job identified by job_id through its
// batch manager. The job is looked up with findJob.
// A Batch::GenericException is logged and rethrown as a LauncherException with the
// same message.
// The log messages name this operation (they previously reported "getJobResult",
// which made traces misleading).
218 Launcher_cpp::clearJobWorkingDir(int job_id)
220 LAUNCHER_MESSAGE("Clear the remote working directory");
222 Launcher::Job * job = findJob(job_id);
// NOTE(review): _batchmap[job_id] inserts a null entry if no batch manager is
// registered for job_id - confirm one always exists here.
225 _batchmap[job_id]->clearWorkingDir(*(job->getBatchJob()));
227 catch(const Batch::GenericException &ex)
229 LAUNCHER_INFOS("clearJobWorkingDir is maybe incomplete, exception: " << ex.message);
230 throw LauncherException(ex.message.c_str());
232 LAUNCHER_MESSAGE("clearJobWorkingDir ended");
235 //=============================================================================
237 * Get Job dump state - the result directory could be changed
239 //=============================================================================
// Imports the dump-state file of the job identified by job_id through its batch
// manager and stores the result of the import call in `rtn`.
// Two import calls are visible: one targets the `directory` argument, the other the
// job's own result directory.
// NOTE(review): the declaration of `rtn`, the condition selecting between the two calls
// and the return statement are not visible in this listing - confirm.
// A Batch::GenericException is logged and rethrown as a LauncherException.
// The log messages name this operation (they previously reported "getJobResult").
241 Launcher_cpp::getJobDumpState(int job_id, std::string directory)
244 LAUNCHER_MESSAGE("Get Job dump state");
246 Launcher::Job * job = findJob(job_id);
// NOTE(review): resource_name is not used by any visible line of this function.
247 std::string resource_name = job->getResourceDefinition().Name;
251 rtn = _batchmap[job_id]->importDumpStateFile(*(job->getBatchJob()), directory);
253 rtn = _batchmap[job_id]->importDumpStateFile(*(job->getBatchJob()), job->getResultDirectory());
255 catch(const Batch::GenericException &ex)
257 LAUNCHER_INFOS("getJobDumpState is maybe incomplete, exception: " << ex.message);
258 throw LauncherException(ex.message.c_str());
260 LAUNCHER_MESSAGE("getJobDumpState ended");
264 //=============================================================================
266 * Get one file from the working directory - the result directory can be changed
268 //=============================================================================
// Imports a single file (work_file) from the working directory of the job identified
// by job_id, through its batch manager, storing the result of the call in `rtn`.
// Two import calls are visible: one targets the `directory` argument, the other the
// job's own result directory.
// NOTE(review): the declaration of `rtn`, the condition selecting between the two calls
// and the return statement are not visible in this listing - confirm.
// A Batch::GenericException is logged and rethrown as a LauncherException.
270 Launcher_cpp::getJobWorkFile(int job_id,
271 std::string work_file,
272 std::string directory)
275 LAUNCHER_MESSAGE("Get working file " << work_file);
277 Launcher::Job * job = findJob(job_id);
// NOTE(review): resource_name is not used by any visible line of this function.
278 std::string resource_name = job->getResourceDefinition().Name;
282 rtn = _batchmap[job_id]->importWorkFile(*(job->getBatchJob()), work_file, directory);
284 rtn = _batchmap[job_id]->importWorkFile(*(job->getBatchJob()), work_file, job->getResultDirectory());
286 catch(const Batch::GenericException &ex)
288 LAUNCHER_INFOS("getJobWorkFile is maybe incomplete, exception: " << ex.message);
289 throw LauncherException(ex.message.c_str());
291 LAUNCHER_MESSAGE("getJobWorkFile ended");
295 //=============================================================================
297 * Remove the job - into the Launcher and its batch manager
299 //=============================================================================
// Removes the job identified by job_id from the launcher.
// If the id is not present in _launcher_job_map, an info message is logged and a
// LauncherException is thrown. Otherwise the job's own removeJob() is called, the
// launcher's reference is released with decrRef(), and the map entry is erased.
301 Launcher_cpp::removeJob(int job_id)
303 LAUNCHER_MESSAGE("Remove Job");
305 // Check if job exist
// A non-const iterator is needed because the entry is erased below.
306 std::map<int, Launcher::Job *>::iterator it_job = _launcher_job_map.find(job_id);
307 if (it_job == _launcher_job_map.end())
309 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
310 throw LauncherException("Cannot find the job, is it created ?");
313 it_job->second->removeJob();
314 it_job->second->decrRef();
315 _launcher_job_map.erase(it_job);
318 //=============================================================================
322 //=============================================================================
// Stops the job identified by job_id. The job is looked up with findJob, which
// throws a LauncherException when the id is unknown.
// NOTE(review): the statement that actually stops the job is not visible in this listing.
324 Launcher_cpp::stopJob(int job_id)
326 LAUNCHER_MESSAGE("Stop Job");
328 Launcher::Job * job = findJob(job_id);
// Serialises the job identified by job_id with Launcher::XML_Persistence::dumpJob and
// returns the result. The job is looked up with findJob, which throws a
// LauncherException when the id is unknown.
333 Launcher_cpp::dumpJob(int job_id)
335 LAUNCHER_MESSAGE("dump Job");
337 Launcher::Job * job = findJob(job_id);
338 return Launcher::XML_Persistence::dumpJob(*job);
// Recreates a job from its dumped string form (dumpedJob) and registers it via addJob.
// A LauncherException raised in the process is caught and logged.
// NOTE(review): the declaration of jobId, its value on failure and the return statement
// are not visible in this listing - confirm.
342 Launcher_cpp::restoreJob(const std::string& dumpedJob)
344 LAUNCHER_MESSAGE("restore Job");
// Custom deleter: the local handle releases its reference with decrRef() instead of
// calling delete when it goes out of scope.
345 auto JobDel = [] (Launcher::Job *job) { if(job) job->decrRef(); };
346 std::unique_ptr<Launcher::Job, decltype(JobDel)> new_job(nullptr,JobDel);
351 new_job.reset(Launcher::XML_Persistence::createJobFromString(dumpedJob));
355 jobId = addJob(new_job.get());
358 catch(const LauncherException &ex)
360 LAUNCHER_INFOS("Cannot load the job. Exception: " << ex.msg.c_str());
365 //=============================================================================
367 * create a launcher job based on a file
368 * \param xmlExecuteFile : to define the execution on the batch cluster
370 //=============================================================================
// Creates a command job from an XML description file and registers it with createJob.
// \param xmlExecuteFile path of the XML file parsed by ParseXmlFile
// \param clusterName    key used to select the entry of job_params.MachinesList and
//                       used as the required resource hostname
// \return the number assigned to the new job (new_job->getNumber())
372 Launcher_cpp::createJobWithFile(const std::string xmlExecuteFile,
373 const std::string clusterName)
375 LAUNCHER_MESSAGE("Begin of Launcher_cpp::createJobWithFile");
378 ParserLauncherType job_params = ParseXmlFile(xmlExecuteFile);
380 // Creating a new job
// The local handle releases its reference with decrRef() when it goes out of scope.
// NOTE(review): whether createJob takes its own reference on the job is not visible
// in this listing - confirm the job outlives this function.
381 auto JobDel = [] (Launcher::Job *job) { if(job) job->decrRef(); };
382 std::unique_ptr<Launcher::Job_Command, decltype(JobDel)> new_job(new Launcher::Job_Command,JobDel);
// Write the command taken from the XML file into a temporary shell script.
384 std::string cmdFile = Kernel_Utils::GetTmpFileName();
391 os.open(cmdFile.c_str(), std::ofstream::out );
392 os << "#! /bin/sh" << std::endl;
393 os << job_params.Command;
396 new_job->setJobFile(cmdFile);
397 new_job->setLocalDirectory(job_params.RefDirectory);
398 new_job->setWorkDirectory(job_params.MachinesList[clusterName].WorkDirectory);
399 new_job->setEnvFile(job_params.MachinesList[clusterName].EnvFile);
// NOTE(review): `int i` is compared against size(), which is presumably unsigned
// (signed/unsigned comparison) - confirm the container type.
401 for(int i=0; i < job_params.InputFile.size(); i++)
402 new_job->add_in_file(job_params.InputFile[i]);
403 for(int i=0; i < job_params.OutputFile.size();i++)
404 new_job->add_out_file(job_params.OutputFile[i]);
// Resource requirements: the target host is the requested cluster.
407 p.hostname = clusterName;
410 p.nb_proc = job_params.NbOfProcesses;
412 p.nb_proc_per_node = 0;
415 new_job->setResourceRequiredParams(p);
417 createJob(new_job.get());
418 return new_job->getNumber();
421 //=============================================================================
423 * Factory to instantiate the appropriate batch manager for the chosen cluster.
425 //=============================================================================
// Builds a Batch::BatchManager for the resource described by params.
// params.Protocol is mapped to a Batch::CommunicationProtocolType (SH, RSH, SSH or
// RSYNC); an unrecognised protocol raises a LauncherException.
// params.Batch selects the batch manager type; an unsupported value raises a
// LauncherException. The matching factory is fetched from the
// Batch::BatchManagerCatalog singleton and invoked with the host name, user name,
// protocol and MPI implementation string.
// NOTE(review): the case labels of both switches and the definitions of `bmType` and
// `mpi` are not visible in this listing.
426 Batch::BatchManager *
427 Launcher_cpp::FactoryBatchManager(ParserResourcesType& params)
430 Batch::CommunicationProtocolType protocol;
431 Batch::FactBatchManager * fact;
433 std::string hostname = params.HostName;
435 switch(params.Protocol)
438 protocol = Batch::SH;
441 protocol = Batch::RSH;
444 protocol = Batch::SSH;
447 protocol = Batch::RSYNC;
450 throw LauncherException("Unknown protocol for this resource");
482 switch( params.Batch )
515 LAUNCHER_MESSAGE("Bad batch description of the resource: Batch = " << params.Batch);
516 throw LauncherException("No batchmanager for that cluster - Bad batch description of the resource");
518 Batch::BatchManagerCatalog & cata = Batch::BatchManagerCatalog::getInstance();
// The catalog lookup result is cast to a batch manager factory.
519 fact = dynamic_cast<Batch::FactBatchManager*>(cata(bmType));
521 LAUNCHER_MESSAGE("Cannot find batch manager factory for " << bmType << ". Check your version of libBatch.");
522 throw LauncherException("Cannot find batch manager factory");
524 LAUNCHER_MESSAGE("Instantiation of batch manager of type: " << bmType);
525 Batch::BatchManager * batch_client = (*fact)(hostname.c_str(), params.UserName.c_str(),
526 protocol, mpi.c_str());
530 //----------------------------------------------------------
531 // Without LIBBATCH - Launcher_cpp does nothing...
532 //----------------------------------------------------------
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; new_job is not used.
536 Launcher_cpp::createJob(Launcher::Job * new_job)
538 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot create a job !!!");
539 throw LauncherException("Method Launcher_cpp::createJob is not available "
540 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; job_id is not used.
544 Launcher_cpp::launchJob(int job_id)
546 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot launch a job !!!");
547 throw LauncherException("Method Launcher_cpp::launchJob is not available "
548 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; job_id is not used.
552 Launcher_cpp::getJobState(int job_id)
554 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job state!!!");
555 throw LauncherException("Method Launcher_cpp::getJobState is not available "
556 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; job_id is not used.
560 Launcher_cpp::getAssignedHostnames(int job_id)
562 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job assigned hostnames!!!");
563 throw LauncherException("Method Launcher_cpp::getAssignedHostnames is not available "
564 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; the arguments are not used.
568 Launcher_cpp::getJobResults(int job_id, std::string directory)
570 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job results!!!");
571 throw LauncherException("Method Launcher_cpp::getJobResults is not available "
572 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; job_id is not used.
576 Launcher_cpp::clearJobWorkingDir(int job_id)
578 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot clear directory!!!");
579 throw LauncherException("Method Launcher_cpp::clearJobWorkingDir is not available "
580 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; the arguments are not used.
584 Launcher_cpp::getJobDumpState(int job_id, std::string directory)
586 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job dump state!!!");
587 throw LauncherException("Method Launcher_cpp::getJobDumpState is not available "
588 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; the arguments are not used.
// The log message names this operation (it previously reported "job dump state").
592 Launcher_cpp::getJobWorkFile(int job_id, std::string work_file, std::string directory)
594 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job work file!!!");
595 throw LauncherException("Method Launcher_cpp::getJobWorkFile is not available "
596 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; job_id is not used.
600 Launcher_cpp::removeJob(int job_id)
602 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot remove job!!!");
603 throw LauncherException("Method Launcher_cpp::removeJob is not available "
604 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: always throws a
// LauncherException; job_id is not used. No log line is visible for this stub.
608 Launcher_cpp::stopJob(int job_id)
610 throw LauncherException("Method Launcher_cpp::stopJob is not available "
611 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; job_id is not used.
615 Launcher_cpp::dumpJob(int job_id)
617 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot dump job!!!");
618 throw LauncherException("Method Launcher_cpp::dumpJob is not available "
619 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: logs the problem and always
// throws a LauncherException; dumpedJob is not used.
624 Launcher_cpp::restoreJob(const std::string& dumpedJob)
626 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot restore job!!!");
627 throw LauncherException("Method Launcher_cpp::restoreJob is not available "
628 "(libBatch was not present at compilation time)");
// Stub used when the launcher is built without libBatch: always throws a
// LauncherException; the arguments are not used. No log line is visible for this stub.
633 Launcher_cpp::createJobWithFile( const std::string xmlExecuteFile, std::string clusterName)
635 throw LauncherException("Method Launcher_cpp::createJobWithFile is not available "
636 "(libBatch was not present at compilation time)");
// Parses the XML job description file xmlExecuteFile into a ParserLauncherType.
// The file is first opened with fopen to check that it is readable, then read with
// xmlReadFile and handed to a SALOME_Launcher_Handler that fills job_params.
// Two failures are reported by logging a message and throwing a LauncherException
// with the same text: the file cannot be parsed, or the file is not readable.
// The messages are prefixed with this class name (they previously named
// "ResourcesManager_cpp", which pointed readers of the log at the wrong component).
// NOTE(review): the release of `handler`, `aFile` and `aDoc`, and the return statement,
// are not visible in this listing - confirm they are freed on every path, including
// before each throw.
643 Launcher_cpp::ParseXmlFile(std::string xmlExecuteFile)
645 ParserLauncherType job_params;
646 SALOME_Launcher_Handler * handler = new SALOME_Launcher_Handler(job_params);
648 const char* aFilePath = xmlExecuteFile.c_str();
// Readability check before handing the path to libxml2.
649 FILE* aFile = fopen(aFilePath, "r");
652 xmlDocPtr aDoc = xmlReadFile(aFilePath, NULL, 0);
654 handler->ProcessXmlDocument(aDoc);
657 std::string message = "Launcher_cpp: could not parse file: " + xmlExecuteFile;
658 LAUNCHER_MESSAGE(message);
660 throw LauncherException(message);
668 std::string message = "Launcher_cpp: file is not readable: " + xmlExecuteFile;
669 LAUNCHER_MESSAGE(message);
671 throw LauncherException(message);
// Returns a copy of the launcher's job map (job number -> job pointer).
// The map is returned by value, so callers get their own container holding the same
// Launcher::Job pointers.
679 std::map<int, Launcher::Job *>
680 Launcher_cpp::getJobs()
682 return _launcher_job_map;
// Selects a resource for the job and returns the batch manager associated with it.
// Steps visible in this listing:
//  - ask _ResManager for the resources fitting the job's required parameters,
//    restricted to resources that can launch batch jobs;
//  - if none fits, set the job state to "ERROR" and throw a LauncherException;
//  - take the first fitting resource and set its definition on the job;
//  - if no batch manager is registered in _batchmap for the job number, create one
//    with FactoryBatchManager and store it.
// ResourcesException and Batch::GenericException are rethrown as LauncherException.
// NOTE(review): the handling when a batch manager already exists for the job, and the
// return statement, are not visible in this listing.
687 Launcher_cpp::getBatchManager(Launcher::Job * job)
689 Batch::BatchManager* result = nullptr;
690 int job_id = job->getNumber();
692 // Select a resource for the job
693 std::vector<std::string> ResourceList;
694 resourceParams params = job->getResourceRequiredParams();
695 // Consider only resources that can launch batch jobs
696 params.can_launch_batch_jobs = true;
699 ResourceList = _ResManager->GetFittingResources(params);
701 catch(const ResourcesException &ex)
703 throw LauncherException(ex.msg.c_str());
705 if (ResourceList.size() == 0)
707 LAUNCHER_INFOS("No adequate resource found for the job, number " << job->getNumber());
708 job->setState("ERROR");
709 throw LauncherException("No resource found the job");
712 // Configure the job with the resource selected - the first of the list
713 ParserResourcesType resource_definition = _ResManager->GetResourcesDescr(ResourceList[0]);
715 // Set resource definition to the job
716 // The job will check if the definitions needed
719 job->setResourceDefinition(resource_definition);
721 catch(const LauncherException &ex)
723 LAUNCHER_INFOS("Error in the definition of the resource, mess: " << ex.msg);
724 job->setState("ERROR");
728 // Step 2: We can now add a Factory if the resource is correctly defined
729 std::map<int, Batch::BatchManager *>::const_iterator it = _batchmap.find(job_id);
730 if(it == _batchmap.end())
734 // Warning: this cannot be written on one line as below, because the map entry is
735 // constructed before the method is called...
736 //_batchmap[job_id] = FactoryBatchManager(resource_definition);
737 result = FactoryBatchManager(resource_definition);
738 _batchmap[job_id] = result;
740 catch(const LauncherException &ex)
742 LAUNCHER_INFOS("Error during creation of the batch manager of the job, mess: " << ex.msg);
745 catch(const Batch::GenericException &ex)
747 LAUNCHER_INFOS("Error during creation of the batch manager of the job, mess: " << ex.message);
748 throw LauncherException(ex.message);
// Registers an already-submitted job with the launcher without submitting it again.
// The job receives the current value of _job_cpt as its number, is attached to its
// batch manager with BatchManager::addJob using the job's stored reference, and is
// then inserted into _launcher_job_map.
// A Batch::GenericException raised by addJob is logged and rethrown as a
// LauncherException. A duplicate job number is logged and reported with a
// LauncherException.
// The duplicate-id messages now read "A job has already the same id", fixing the
// "as" typo and matching the wording used by createJob.
// NOTE(review): the update of _job_cpt and any reference counting done on insertion are
// not visible in this listing - confirm.
758 Launcher_cpp::addJobDirectlyToMap(Launcher::Job * new_job)
760 // Step 0: Calculated job_id
761 new_job->setNumber(_job_cpt);
765 // Step 1: check if resource is already in the map
766 Batch::BatchManager * bm = getBatchManager(new_job);
768 // Step 2: add the job to the batch manager
771 Batch::JobId batch_manager_job_id = bm->addJob(*(new_job->getBatchJob()),
772 new_job->getReference());
773 new_job->setBatchManagerJobId(batch_manager_job_id);
775 catch(const Batch::GenericException &ex)
777 LAUNCHER_INFOS("Job cannot be added, exception in addJob: " << ex.message);
778 throw LauncherException(ex.message.c_str());
781 // Step 3: add job to launcher map
782 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(new_job->getNumber());
783 if (it_job == _launcher_job_map.end())
785 _launcher_job_map[new_job->getNumber()] = new_job;
790 LAUNCHER_INFOS("A job has already the same id: " << new_job->getNumber());
791 throw LauncherException("A job has already the same id - job is not created !");
793 LAUNCHER_MESSAGE("New job added");
// Adds a restored job to the launcher according to its saved state and records its
// number in jobId.
//  - "CREATED": run-part information is ignored.
//    NOTE(review): the call that registers the job in this branch is not visible.
//  - "QUEUED", "RUNNING", "IN_PROCESS", "PAUSED": the job is added with
//    addJobDirectlyToMap; if the reference returned by the batch manager differs from
//    the job's stored reference, the job state is set to "ERROR".
//  - "FINISHED", "FAILED", "ERROR": the job is added with addJobDirectlyToMap.
//  - any other state: an info message is logged.
// NOTE(review): the declaration of jobId and the return statement are not visible in
// this listing.
798 Launcher_cpp::addJob(Launcher::Job * new_job)
800 string job_state = new_job->getState();
802 if (job_state == "CREATED")
804 // In this case, we ignore run_part information
806 jobId = new_job->getNumber();
808 else if (job_state == "QUEUED" ||
809 job_state == "RUNNING" ||
810 job_state == "IN_PROCESS" ||
811 job_state == "PAUSED")
813 addJobDirectlyToMap(new_job);
814 jobId = new_job->getNumber();
816 // We check that the BatchManager could resume the job
818 if (new_job->getBatchManagerJobId().getReference() != new_job->getReference())
820 LAUNCHER_INFOS("BatchManager type cannot resume a job - job state is set to ERROR");
821 new_job->setState("ERROR");
825 else if (job_state == "FINISHED" ||
826 job_state == "FAILED" ||
827 job_state == "ERROR")
829 // We add run_part information
830 addJobDirectlyToMap(new_job);
831 jobId = new_job->getNumber();
835 LAUNCHER_INFOS("A bad job is found, state unknown " << job_state);
// Loads jobs from the XML file jobs_file and adds each of them to the launcher.
// \param jobs_file path handed to Launcher::XML_Persistence::loadJobs
// \return the list of job ids collected from addJob
// A LauncherException raised while adding one job is caught and logged, so the
// remaining jobs are still processed.
842 Launcher_cpp::loadJobs(const char* jobs_file)
// Custom deleter: each loaded job's local handle calls decrRef() when it is destroyed.
844 auto JobDel = [] (Launcher::Job *job) { if(job) job->decrRef(); };
846 list<int> new_jobs_id_list;
848 // Load the jobs from XML file
849 list<Launcher::Job *> jobs_list = Launcher::XML_Persistence::loadJobs(jobs_file);
851 // Create each job in the launcher
852 list<Launcher::Job *>::const_iterator it_job;
853 for (it_job = jobs_list.begin(); it_job != jobs_list.end(); it_job++)
855 std::unique_ptr<Launcher::Job, decltype(JobDel) > new_job(*it_job, JobDel);
// NOTE(review): the declaration of jobId and any condition guarding the push_back
// are not visible in this listing.
859 jobId = addJob(new_job.get());
861 new_jobs_id_list.push_back(jobId);
863 catch(const LauncherException &ex)
865 LAUNCHER_INFOS("Cannot load the job. Exception: " << ex.msg.c_str());
869 return new_jobs_id_list;
// Saves every job known to the launcher into the XML file jobs_file.
// Job numbers from 0 up to _job_cpt - 1 are probed in increasing order, so the list
// handed to Launcher::XML_Persistence::saveJobs is ordered by job number and skips
// numbers that are no longer present in _launcher_job_map.
873 Launcher_cpp::saveJobs(const char* jobs_file)
875 // Create a sorted list from the internal job map
876 list<const Launcher::Job *> jobs_list;
878 for (int i=0; i<_job_cpt; i++)
880 map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(i);
881 if (it_job != _launcher_job_map.end())
882 jobs_list.push_back(it_job->second);
885 // Save the jobs in XML file
886 Launcher::XML_Persistence::saveJobs(jobs_file, jobs_list);
// Looks up the job registered under job_id in _launcher_job_map.
// When the id is unknown, an info message is logged and a LauncherException is thrown.
// NOTE(review): the end of this function (presumably returning `job`) is not visible
// in this listing.
890 Launcher_cpp::findJob(int job_id)
892 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(job_id);
893 if (it_job == _launcher_job_map.end())
895 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
896 throw LauncherException("Cannot find the job, is it created ?");
898 Launcher::Job * job = it_job->second;