1 // Copyright (C) 2007-2016 CEA/DEN, EDF R&D, OPEN CASCADE
3 // Copyright (C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
4 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 2.1 of the License, or (at your option) any later version.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // Lesser General Public License for more details.
16 // You should have received a copy of the GNU Lesser General Public
17 // License along with this library; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
30 #include <libbatch/BatchManagerCatalog.hxx>
31 #include <libbatch/FactBatchManager.hxx>
32 #include <libbatch/BatchManager.hxx>
35 #include "Basics_Utils.hxx"
36 #include "Basics_DirUtils.hxx"
37 #include "SALOME_Launcher_Handler.hxx"
38 #include "Launcher.hxx"
39 #include "Launcher_Job_Command.hxx"
40 #include "Launcher_XML_Persistence.hxx"
44 //=============================================================================
48 * Define a CORBA single thread policy for the server, which avoids dealing
49 * with non-thread-safe usage such as Change_Directory in the SALOME naming service
51 //=============================================================================
// Constructor. The only statement visible in this listing emits a trace message.
// NOTE(review): the embedded numbering jumps from 54 to 58, so any member
// initialisation done here is not visible - confirm against the full file.
52 Launcher_cpp::Launcher_cpp()
54 LAUNCHER_MESSAGE("Launcher_cpp constructor");
58 //=============================================================================
62 //=============================================================================
// Destructor. Emits a trace message, deletes every Launcher::Job stored in
// _launcher_job_map, then iterates over _batchmap.
// NOTE(review): the body of the second loop is not visible in this listing;
// presumably it deletes each Batch::BatchManager - confirm.
63 Launcher_cpp::~Launcher_cpp()
65 LAUNCHER_MESSAGE("Launcher_cpp destructor");
67 std::map<int, Launcher::Job *>::const_iterator it_job;
// The map stores raw pointers, so each job is released by hand.
68 for(it_job = _launcher_job_map.begin(); it_job != _launcher_job_map.end(); it_job++)
69 delete it_job->second;
70 std::map <int, Batch::BatchManager * >::const_iterator it1;
71 for(it1=_batchmap.begin();it1!=_batchmap.end();it1++)
78 //=============================================================================
80 * Add a job into the launcher - check resource and choose one
82 //=============================================================================
// Registers new_job in the launcher job map.
// The job number is taken from the _job_cpt counter. If that number is not yet
// a key of _launcher_job_map the job is stored under it; otherwise the clash
// is logged and a LauncherException is thrown.
// NOTE(review): the statement that advances _job_cpt, and whatever happens to
// new_job before the throw, are not visible in this listing - confirm.
84 Launcher_cpp::createJob(Launcher::Job * new_job)
86 LAUNCHER_MESSAGE("Creating a new job");
87 // Add job to the jobs map
88 new_job->setNumber(_job_cpt);
90 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(new_job->getNumber());
91 if (it_job == _launcher_job_map.end())
93 _launcher_job_map[new_job->getNumber()] = new_job;
// Duplicate id: report it and refuse the job.
97 LAUNCHER_INFOS("A job has already the same id: " << new_job->getNumber());
99 throw LauncherException("A job has already the same id - job is not created !");
101 LAUNCHER_MESSAGE("New Job created");
104 //=============================================================================
108 //=============================================================================
// Submits an already created job to its batch manager.
// \param job_id number of the job; findJob throws LauncherException when it is unknown.
// Throws LauncherException when the job state is not CREATED, or when submitJob
// raises Batch::GenericException (the batch message is forwarded).
// On success the job records the batch manager job id and its reference, and
// its state is set to QUEUED.
110 Launcher_cpp::launchJob(int job_id)
112 LAUNCHER_MESSAGE("Launch a job");
114 // Check if job exists
115 Launcher::Job * job = findJob(job_id);
117 // Check job state (cannot launch a job already launched...)
118 if (job->getState() != "CREATED")
120 LAUNCHER_INFOS("Bad state of the job: " << job->getState());
121 throw LauncherException("Bad state of the job: " + job->getState());
// Look up, or create on first use, the batch manager tied to this job (may throw).
124 Batch::BatchManager * bm = getBatchManager(job);
// Hand the batch job over and remember the identifiers returned by the manager.
127 Batch::JobId batch_manager_job_id = bm->submitJob(*(job->getBatchJob()));
128 job->setBatchManagerJobId(batch_manager_job_id);
129 job->setState("QUEUED");
130 job->setReference(batch_manager_job_id.getReference());
// Translate batch library failures into the launcher exception type.
132 catch(const Batch::GenericException &ex)
134 LAUNCHER_INFOS("Job is not launched, exception in submitJob: " << ex.message);
135 throw LauncherException(ex.message.c_str());
137 LAUNCHER_MESSAGE("Job launched");
140 //=============================================================================
144 //=============================================================================
// Refreshes and returns the state of a job.
// \param job_id number of the job; findJob throws LauncherException when it is unknown.
// A Batch::GenericException raised by updateJobState is logged and rethrown as
// LauncherException.
// NOTE(review): the declaration of state and the return type are not visible in
// this listing. If state is a function-local std::string and the function
// returns a raw character pointer, the pointer obtained from c_str() is invalid
// as soon as the function returns - confirm.
146 Launcher_cpp::getJobState(int job_id)
148 LAUNCHER_MESSAGE("Get job state");
150 // Check if job exist
151 Launcher::Job * job = findJob(job_id);
// Ask the job itself to refresh its state.
156 state = job->updateJobState();
158 catch(const Batch::GenericException &ex)
160 LAUNCHER_INFOS("getJobState failed, exception: " << ex.message);
161 throw LauncherException(ex.message.c_str());
164 return state.c_str();
167 //=============================================================================
169 * Get job assigned hostnames
171 //=============================================================================
// Returns the host names assigned to a job, as reported by the job object.
// \param job_id number of the job; findJob throws LauncherException when it is unknown.
// NOTE(review): assigned_hostnames is a function-local std::string. If the
// return type (not visible in this listing) is a raw character pointer, the
// pointer obtained from c_str() refers to storage destroyed at return - confirm.
173 Launcher_cpp::getAssignedHostnames(int job_id)
175 LAUNCHER_MESSAGE("Get job assigned hostnames");
177 // Check if job exist
178 Launcher::Job * job = findJob(job_id);
179 std::string assigned_hostnames = job->getAssignedHostnames();
181 return assigned_hostnames.c_str();
184 //=============================================================================
186 * Get Job result - the result directory could be changed
188 //=============================================================================
// Imports the output files of a job through its batch manager.
// \param job_id number of the job; findJob throws LauncherException when it is unknown.
// \param directory destination directory; the visible code has one call that
//        uses it and one call that uses job->getResultDirectory() instead.
// NOTE(review): the condition choosing between the two calls is not visible in
// this listing - presumably an empty directory selects the job result directory.
// A Batch::GenericException is logged and rethrown as LauncherException.
190 Launcher_cpp::getJobResults(int job_id, std::string directory)
192 LAUNCHER_MESSAGE("Get Job results");
194 Launcher::Job * job = findJob(job_id);
// NOTE(review): resource_name is not used by any visible line of this function.
195 std::string resource_name = job->getResourceDefinition().Name;
// NOTE(review): std::map::operator[] inserts a null entry when job_id has no
// batch manager yet, and the calls below would then dereference it - confirm
// that a manager is always registered before this function is reached.
199 _batchmap[job_id]->importOutputFiles(*(job->getBatchJob()), directory);
201 _batchmap[job_id]->importOutputFiles(*(job->getBatchJob()), job->getResultDirectory());
203 catch(const Batch::GenericException &ex)
205 LAUNCHER_INFOS("getJobResult is maybe incomplete, exception: " << ex.message);
206 throw LauncherException(ex.message.c_str());
208 LAUNCHER_MESSAGE("getJobResult ended");
211 //=============================================================================
213 * Clear the remote working directory
215 //=============================================================================
// Asks the batch manager of a job to clear the working directory of that job.
// \param job_id number of the job; findJob throws LauncherException when it is unknown.
// A Batch::GenericException is logged and rethrown as LauncherException.
217 Launcher_cpp::clearJobWorkingDir(int job_id)
219 LAUNCHER_MESSAGE("Clear the remote working directory");
221 Launcher::Job * job = findJob(job_id);
// NOTE(review): std::map::operator[] inserts a null entry when job_id has no
// batch manager yet, which this call would then dereference - confirm.
224 _batchmap[job_id]->clearWorkingDir(*(job->getBatchJob()));
226 catch(const Batch::GenericException &ex)
// NOTE(review): the two messages below name getJobResult although this is
// clearJobWorkingDir - looks like a copy-paste leftover.
228 LAUNCHER_INFOS("getJobResult is maybe incomplete, exception: " << ex.message);
229 throw LauncherException(ex.message.c_str());
231 LAUNCHER_MESSAGE("getJobResult ended");
234 //=============================================================================
236 * Get Job dump state - the result directory could be changed
238 //=============================================================================
// Imports the dump state file of a job through its batch manager.
// \param job_id number of the job; findJob throws LauncherException when it is unknown.
// \param directory destination directory; the visible code has one call that
//        uses it and one call that uses job->getResultDirectory() instead.
// The value returned by importDumpStateFile is stored in rtn.
// NOTE(review): the declaration of rtn, the condition selecting the call and
// the return statement are not visible in this listing.
// A Batch::GenericException is logged and rethrown as LauncherException.
240 Launcher_cpp::getJobDumpState(int job_id, std::string directory)
243 LAUNCHER_MESSAGE("Get Job dump state");
245 Launcher::Job * job = findJob(job_id);
// NOTE(review): resource_name is not used by any visible line of this function.
246 std::string resource_name = job->getResourceDefinition().Name;
// NOTE(review): std::map::operator[] inserts a null entry when job_id has no
// batch manager yet, which these calls would then dereference - confirm.
250 rtn = _batchmap[job_id]->importDumpStateFile(*(job->getBatchJob()), directory);
252 rtn = _batchmap[job_id]->importDumpStateFile(*(job->getBatchJob()), job->getResultDirectory());
254 catch(const Batch::GenericException &ex)
// NOTE(review): the two messages below name getJobResult although this is
// getJobDumpState - looks like a copy-paste leftover.
256 LAUNCHER_INFOS("getJobResult is maybe incomplete, exception: " << ex.message);
257 throw LauncherException(ex.message.c_str());
259 LAUNCHER_MESSAGE("getJobResult ended");
263 //=============================================================================
265 * Get one file from the working directory - the result directory can be changed
267 //=============================================================================
// Imports a single file from the working directory of a job.
// \param job_id number of the job; findJob throws LauncherException when it is unknown.
// \param work_file name of the file to import, forwarded to importWorkFile.
// \param directory destination directory; the visible code has one call that
//        uses it and one call that uses job->getResultDirectory() instead.
// The value returned by importWorkFile is stored in rtn.
// NOTE(review): the declaration of rtn, the condition selecting the call and
// the return statement are not visible in this listing.
// A Batch::GenericException is logged and rethrown as LauncherException.
269 Launcher_cpp::getJobWorkFile(int job_id,
270 std::string work_file,
271 std::string directory)
274 LAUNCHER_MESSAGE("Get working file " << work_file);
276 Launcher::Job * job = findJob(job_id);
// NOTE(review): resource_name is not used by any visible line of this function.
277 std::string resource_name = job->getResourceDefinition().Name;
// NOTE(review): std::map::operator[] inserts a null entry when job_id has no
// batch manager yet, which these calls would then dereference - confirm.
281 rtn = _batchmap[job_id]->importWorkFile(*(job->getBatchJob()), work_file, directory);
283 rtn = _batchmap[job_id]->importWorkFile(*(job->getBatchJob()), work_file, job->getResultDirectory());
285 catch(const Batch::GenericException &ex)
287 LAUNCHER_INFOS("getJobWorkFile is maybe incomplete, exception: " << ex.message);
288 throw LauncherException(ex.message.c_str());
290 LAUNCHER_MESSAGE("getJobWorkFile ended");
294 //=============================================================================
296 * Remove the job from the Launcher and from its batch manager
298 //=============================================================================
// Removes a job from the launcher.
// \param job_id number of the job to remove.
// Throws LauncherException when job_id is not a key of _launcher_job_map.
// Otherwise the job is asked to remove itself (Job::removeJob), the object is
// deleted and its entry is erased from the map.
// NOTE(review): no visible line touches the _batchmap entry of this job.
300 Launcher_cpp::removeJob(int job_id)
302 LAUNCHER_MESSAGE("Remove Job");
304 // Check if job exist
305 std::map<int, Launcher::Job *>::iterator it_job = _launcher_job_map.find(job_id);
306 if (it_job == _launcher_job_map.end())
308 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
309 throw LauncherException("Cannot find the job, is it created ?");
// Order matters: the job is used, then destroyed, then its map slot is dropped.
312 it_job->second->removeJob();
313 delete it_job->second;
314 _launcher_job_map.erase(it_job);
317 //=============================================================================
321 //=============================================================================
// Stops a job.
// \param job_id number of the job; findJob throws LauncherException when it is unknown.
// NOTE(review): only the trace and the job lookup are visible in this listing;
// the statement that actually stops the job is not shown.
323 Launcher_cpp::stopJob(int job_id)
325 LAUNCHER_MESSAGE("Stop Job");
327 Launcher::Job * job = findJob(job_id);
// Serialises a job by delegating to Launcher::XML_Persistence::dumpJob.
// \param job_id number of the job; findJob throws LauncherException when it is unknown.
// \return whatever XML_Persistence::dumpJob returns for that job.
332 Launcher_cpp::dumpJob(int job_id)
334 LAUNCHER_MESSAGE("dump Job");
336 Launcher::Job * job = findJob(job_id);
337 return Launcher::XML_Persistence::dumpJob(*job);
// Rebuilds a job from its serialised form and registers it through addJob.
// \param dumpedJob serialised job, passed to XML_Persistence::createJobFromString.
// A LauncherException raised on the way is caught and logged here.
// NOTE(review): the declaration of jobId, the handling of new_job after a
// failure and the return statement are not visible in this listing.
341 Launcher_cpp::restoreJob(const std::string& dumpedJob)
343 LAUNCHER_MESSAGE("restore Job");
344 Launcher::Job * new_job=NULL;
349 new_job = Launcher::XML_Persistence::createJobFromString(dumpedJob);
353 jobId = addJob(new_job);
358 catch(const LauncherException &ex)
360 LAUNCHER_INFOS("Cannot load the job. Exception: " << ex.msg.c_str());
367 //=============================================================================
369 * Create a launcher job based on a file
370 * \param xmlExecuteFile : XML file that defines the execution on the batch cluster
372 //=============================================================================
// Builds a command job from an XML description file.
// \param xmlExecuteFile path of the XML file, parsed with ParseXmlFile.
// \param clusterName key used to read the per-machine settings from
//        job_params.MachinesList and copied into the hostname of the resource
//        requirements.
// \return the number of the new job (new_job->getNumber()).
// Visible steps: parse the file, allocate a Launcher::Job_Command, write the
// parsed command into a temporary shell script, copy directories, environment
// file and input/output file lists into the job, then set the resource
// requirements.
// NOTE(review): the declarations of os and p, the closing of the stream and
// the statement that registers the job are not visible in this listing.
374 Launcher_cpp::createJobWithFile(const std::string xmlExecuteFile,
375 const std::string clusterName)
377 LAUNCHER_MESSAGE("Begin of Launcher_cpp::createJobWithFile");
380 ParserLauncherType job_params = ParseXmlFile(xmlExecuteFile);
382 // Creating a new job
383 Launcher::Job_Command * new_job = new Launcher::Job_Command();
// The command is wrapped in a small sh script stored in a temporary file.
385 std::string cmdFile = Kernel_Utils::GetTmpFileName();
392 os.open(cmdFile.c_str(), std::ofstream::out );
393 os << "#! /bin/sh" << std::endl;
394 os << job_params.Command;
397 new_job->setJobFile(cmdFile);
398 new_job->setLocalDirectory(job_params.RefDirectory);
// NOTE(review): if MachinesList is a std::map, operator[] silently creates an
// empty entry for an unknown clusterName - confirm the container type.
399 new_job->setWorkDirectory(job_params.MachinesList[clusterName].WorkDirectory);
400 new_job->setEnvFile(job_params.MachinesList[clusterName].EnvFile);
// NOTE(review): int index compared with size(); a signed/unsigned comparison
// if size() returns an unsigned type.
402 for(int i=0; i < job_params.InputFile.size(); i++)
403 new_job->add_in_file(job_params.InputFile[i]);
404 for(int i=0; i < job_params.OutputFile.size();i++)
405 new_job->add_out_file(job_params.OutputFile[i]);
// Resource requirements: target host and process count come from the inputs.
408 p.hostname = clusterName;
411 p.nb_proc = job_params.NbOfProcesses;
413 p.nb_proc_per_node = 0;
416 new_job->setResourceRequiredParams(p);
419 return new_job->getNumber();
422 //=============================================================================
424 * Factory to instantiate the appropriate batch manager for the chosen cluster.
426 //=============================================================================
// Creates a batch manager for the resource described by params.
// \param params resource description; the visible code reads HostName,
//        Protocol, Batch and UserName.
// \return the Batch::BatchManager built by the factory found in the catalog.
// Throws LauncherException for an unknown protocol, for an unsupported batch
// type, or when no factory is found for the batch manager type.
// NOTE(review): the case labels of both switches, the declarations of mpi and
// bmType, the null check on fact and the return statement are not visible in
// this listing.
427 Batch::BatchManager *
428 Launcher_cpp::FactoryBatchManager(ParserResourcesType& params)
431 Batch::CommunicationProtocolType protocol;
432 Batch::FactBatchManager * fact;
434 std::string hostname = params.HostName;
// Map the resource protocol onto the communication protocol of the batch library.
436 switch(params.Protocol)
439 protocol = Batch::SH;
442 protocol = Batch::RSH;
445 protocol = Batch::SSH;
448 protocol = Batch::RSYNC;
451 throw LauncherException("Unknown protocol for this resource");
// Choose the batch manager type from the batch system of the resource.
483 switch( params.Batch )
516 LAUNCHER_MESSAGE("Bad batch description of the resource: Batch = " << params.Batch);
517 throw LauncherException("No batchmanager for that cluster - Bad batch description of the resource");
// Look the factory up by type name in the batch manager catalog singleton.
519 Batch::BatchManagerCatalog & cata = Batch::BatchManagerCatalog::getInstance();
520 fact = dynamic_cast<Batch::FactBatchManager*>(cata(bmType));
522 LAUNCHER_MESSAGE("Cannot find batch manager factory for " << bmType << ". Check your version of libBatch.");
523 throw LauncherException("Cannot find batch manager factory");
525 LAUNCHER_MESSAGE("Instantiation of batch manager of type: " << bmType);
526 Batch::BatchManager * batch_client = (*fact)(hostname.c_str(), params.UserName.c_str(),
527 protocol, mpi.c_str());
531 //----------------------------------------------------------
532 // Without LIBBATCH - Launcher_cpp does nothing...
533 //----------------------------------------------------------
// createJob variant for builds without libBatch (per the log text): logs the
// condition and always throws LauncherException.
// NOTE(review): one original line between the log and the throw is not visible
// in this listing.
537 Launcher_cpp::createJob(Launcher::Job * new_job)
539 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot create a job !!!");
541 throw LauncherException("Method Launcher_cpp::createJob is not available "
542 "(libBatch was not present at compilation time)");
// launchJob variant for builds without libBatch (per the log text): logs the
// condition and always throws LauncherException. job_id is not used.
546 Launcher_cpp::launchJob(int job_id)
548 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot launch a job !!!");
549 throw LauncherException("Method Launcher_cpp::launchJob is not available "
550 "(libBatch was not present at compilation time)");
// getJobState variant for builds without libBatch (per the log text): logs the
// condition and always throws LauncherException. job_id is not used.
554 Launcher_cpp::getJobState(int job_id)
556 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job state!!!");
557 throw LauncherException("Method Launcher_cpp::getJobState is not available "
558 "(libBatch was not present at compilation time)");
// getAssignedHostnames variant for builds without libBatch (per the log text):
// logs the condition and always throws LauncherException. job_id is not used.
562 Launcher_cpp::getAssignedHostnames(int job_id)
564 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job assigned hostnames!!!");
565 throw LauncherException("Method Launcher_cpp::getAssignedHostnames is not available "
566 "(libBatch was not present at compilation time)");
// getJobResults variant for builds without libBatch (per the log text): logs
// the condition and always throws LauncherException. Parameters are not used.
570 Launcher_cpp::getJobResults(int job_id, std::string directory)
572 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job results!!!");
573 throw LauncherException("Method Launcher_cpp::getJobResults is not available "
574 "(libBatch was not present at compilation time)");
// clearJobWorkingDir variant for builds without libBatch (per the log text):
// logs the condition and always throws LauncherException. job_id is not used.
578 Launcher_cpp::clearJobWorkingDir(int job_id)
580 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot clear directory!!!");
581 throw LauncherException("Method Launcher_cpp::clearJobWorkingDir is not available "
582 "(libBatch was not present at compilation time)");
// getJobDumpState variant for builds without libBatch (per the log text): logs
// the condition and always throws LauncherException. Parameters are not used.
586 Launcher_cpp::getJobDumpState(int job_id, std::string directory)
588 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job dump state!!!");
589 throw LauncherException("Method Launcher_cpp::getJobDumpState is not available "
590 "(libBatch was not present at compilation time)");
// getJobWorkFile variant for builds without libBatch (per the log text): logs
// the condition and always throws LauncherException. Parameters are not used.
594 Launcher_cpp::getJobWorkFile(int job_id, std::string work_file, std::string directory)
// NOTE(review): the log text speaks of the job dump state although this is
// getJobWorkFile - looks like a copy-paste leftover.
596 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot get job dump state!!!");
597 throw LauncherException("Method Launcher_cpp::getJobWorkFile is not available "
598 "(libBatch was not present at compilation time)");
// removeJob variant for builds without libBatch (per the log text): logs the
// condition and always throws LauncherException. job_id is not used.
602 Launcher_cpp::removeJob(int job_id)
604 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot remove job!!!");
605 throw LauncherException("Method Launcher_cpp::removeJob is not available "
606 "(libBatch was not present at compilation time)");
// stopJob variant for builds without libBatch (per the exception text): always
// throws LauncherException. Unlike the sibling variants, no log statement is
// visible here. job_id is not used.
610 Launcher_cpp::stopJob(int job_id)
612 throw LauncherException("Method Launcher_cpp::stopJob is not available "
613 "(libBatch was not present at compilation time)");
// dumpJob variant for builds without libBatch (per the log text): logs the
// condition and always throws LauncherException. job_id is not used.
617 Launcher_cpp::dumpJob(int job_id)
619 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot dump job!!!");
620 throw LauncherException("Method Launcher_cpp::dumpJob is not available "
621 "(libBatch was not present at compilation time)");
// restoreJob variant for builds without libBatch (per the log text): logs the
// condition and always throws LauncherException. dumpedJob is not used.
626 Launcher_cpp::restoreJob(const std::string& dumpedJob)
628 LAUNCHER_INFOS("Launcher compiled without LIBBATCH - cannot restore job!!!");
629 throw LauncherException("Method Launcher_cpp::restoreJob is not available "
630 "(libBatch was not present at compilation time)");
// createJobWithFile variant for builds without libBatch (per the exception
// text): always throws LauncherException. No log statement is visible here and
// the parameters are not used.
635 Launcher_cpp::createJobWithFile( const std::string xmlExecuteFile, std::string clusterName)
637 throw LauncherException("Method Launcher_cpp::createJobWithFile is not available "
638 "(libBatch was not present at compilation time)");
// Parses an XML job description into a ParserLauncherType.
// \param xmlExecuteFile path of the XML file.
// The file is first opened with fopen, then read with xmlReadFile and the
// resulting document is handed to SALOME_Launcher_Handler::ProcessXmlDocument.
// Throws LauncherException with a could-not-parse or not-readable message.
// NOTE(review): the conditions guarding each branch, the release of handler,
// aFile and aDoc, and the return statement are not visible in this listing.
// handler comes from new and aFile from fopen, so confirm that both are
// released on every path, including the two throwing ones.
645 Launcher_cpp::ParseXmlFile(std::string xmlExecuteFile)
647 ParserLauncherType job_params;
648 SALOME_Launcher_Handler * handler = new SALOME_Launcher_Handler(job_params);
650 const char* aFilePath = xmlExecuteFile.c_str();
// The handle is opened in read mode; no visible line reads through it.
651 FILE* aFile = fopen(aFilePath, "r");
654 xmlDocPtr aDoc = xmlReadFile(aFilePath, NULL, 0);
656 handler->ProcessXmlDocument(aDoc);
// NOTE(review): both messages below name ResourcesManager_cpp although this is
// a Launcher_cpp method - looks like a copy-paste leftover.
659 std::string message = "ResourcesManager_cpp: could not parse file: " + xmlExecuteFile;
660 LAUNCHER_MESSAGE(message);
662 throw LauncherException(message);
670 std::string message = "ResourcesManager_cpp: file is not readable: " + xmlExecuteFile;
671 LAUNCHER_MESSAGE(message);
673 throw LauncherException(message);
// Returns the job map by value: the caller receives a copy of the map, while
// the Launcher::Job pointers in it still refer to the same job objects.
681 std::map<int, Launcher::Job *>
682 Launcher_cpp::getJobs()
684 return _launcher_job_map;
// Finds or creates the batch manager associated with a job.
// \param job job whose number is used as key in _batchmap.
// Visible steps:
//  1. ask the resource manager for resources fitting the job requirements,
//     restricted to those that can launch batch jobs;
//  2. fail with LauncherException, after setting the job state to ERROR, when
//     the list is empty;
//  3. give the description of the first resource to the job;
//  4. when _batchmap has no entry for the job number, build a manager with
//     FactoryBatchManager and store it.
// ResourcesException and Batch::GenericException are converted to
// LauncherException.
// NOTE(review): the rethrow statements of two catch blocks, the branch taken
// when the manager already exists and the return statement are not visible in
// this listing.
689 Launcher_cpp::getBatchManager(Launcher::Job * job)
691 Batch::BatchManager* result = nullptr;
692 int job_id = job->getNumber();
694 // Select a resource for the job
695 std::vector<std::string> ResourceList;
696 resourceParams params = job->getResourceRequiredParams();
697 // Consider only resources that can launch batch jobs
698 params.can_launch_batch_jobs = true;
701 ResourceList = _ResManager->GetFittingResources(params);
703 catch(const ResourcesException &ex)
705 throw LauncherException(ex.msg.c_str());
707 if (ResourceList.size() == 0)
709 LAUNCHER_INFOS("No adequate resource found for the job, number " << job->getNumber());
710 job->setState("ERROR");
711 throw LauncherException("No resource found the job");
714 // Configure the job with the resource selected - the first of the list
715 ParserResourcesType resource_definition = _ResManager->GetResourcesDescr(ResourceList[0]);
717 // Set resource definition to the job
718 // The job will check if the definitions needed
721 job->setResourceDefinition(resource_definition);
723 catch(const LauncherException &ex)
725 LAUNCHER_INFOS("Error in the definition of the resource, mess: " << ex.msg);
726 job->setState("ERROR");
730 // Step 2: We can now add a Factory if the resource is correctly define
// find is used here, so the lookup itself never inserts into the map.
731 std::map<int, Batch::BatchManager *>::const_iterator it = _batchmap.find(job_id);
732 if(it == _batchmap.end())
736 // Warning cannot write on one line like this, because map object is constructed before
737 // the method is called...
738 //_batchmap[job_id] = FactoryBatchManager(resource_definition);
739 result = FactoryBatchManager(resource_definition);
740 _batchmap[job_id] = result;
742 catch(const LauncherException &ex)
744 LAUNCHER_INFOS("Error during creation of the batch manager of the job, mess: " << ex.msg);
747 catch(const Batch::GenericException &ex)
749 LAUNCHER_INFOS("Error during creation of the batch manager of the job, mess: " << ex.message);
750 throw LauncherException(ex.message);
// Registers an existing job with both its batch manager and the launcher map,
// without submitting it.
// \param new_job job to register; its number is taken from _job_cpt.
// The job is added to the batch manager with addJob, using the reference
// already stored in the job, and the returned batch job id is recorded.
// Throws LauncherException when getBatchManager fails, when addJob raises
// Batch::GenericException, or when the job number is already in the map.
// NOTE(review): the statement that advances _job_cpt and whatever happens to
// new_job before the last throw are not visible in this listing.
760 Launcher_cpp::addJobDirectlyToMap(Launcher::Job * new_job)
762 // Step 0: Calculated job_id
763 new_job->setNumber(_job_cpt);
767 // Step 1: check if resource is already in the map
768 Batch::BatchManager * bm = getBatchManager(new_job);
770 // Step 2: add the job to the batch manager
773 Batch::JobId batch_manager_job_id = bm->addJob(*(new_job->getBatchJob()),
774 new_job->getReference());
775 new_job->setBatchManagerJobId(batch_manager_job_id);
777 catch(const Batch::GenericException &ex)
779 LAUNCHER_INFOS("Job cannot be added, exception in addJob: " << ex.message);
780 throw LauncherException(ex.message.c_str());
783 // Step 3: add job to launcher map
784 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(new_job->getNumber());
785 if (it_job == _launcher_job_map.end())
787 _launcher_job_map[new_job->getNumber()] = new_job;
// Duplicate id: report it and refuse the job.
791 LAUNCHER_INFOS("A job as already the same id: " << new_job->getNumber());
793 throw LauncherException("A job as already the same id - job is not created !");
795 LAUNCHER_MESSAGE("New job added");
// Registers a job according to the state it already carries and yields its
// number through jobId.
// \param new_job job to register; its getState() value selects the branch:
//   - CREATED: see the note below;
//   - QUEUED, RUNNING, IN_PROCESS, PAUSED: added with addJobDirectlyToMap; the
//     state is set to ERROR when the reference known to the batch manager
//     differs from the reference stored in the job;
//   - FINISHED, FAILED, ERROR: added with addJobDirectlyToMap;
//   - anything else: reported through LAUNCHER_INFOS.
// NOTE(review): the declaration of jobId, the registration call of the CREATED
// branch, what follows the unknown-state log and the return statement are not
// visible in this listing.
800 Launcher_cpp::addJob(Launcher::Job * new_job)
802 string job_state = new_job->getState();
804 if (job_state == "CREATED")
806 // In this case, we ignore run_part information
808 jobId = new_job->getNumber();
810 else if (job_state == "QUEUED" ||
811 job_state == "RUNNING" ||
812 job_state == "IN_PROCESS" ||
813 job_state == "PAUSED")
815 addJobDirectlyToMap(new_job);
816 jobId = new_job->getNumber();
818 // We check that the BatchManager could resume the job
820 if (new_job->getBatchManagerJobId().getReference() != new_job->getReference())
822 LAUNCHER_INFOS("BatchManager type cannot resume a job - job state is set to ERROR");
823 new_job->setState("ERROR");
827 else if (job_state == "FINISHED" ||
828 job_state == "FAILED" ||
829 job_state == "ERROR")
831 // We add run_part information
832 addJobDirectlyToMap(new_job);
833 jobId = new_job->getNumber();
837 LAUNCHER_INFOS("A bad job is found, state unknown " << job_state);
// Loads jobs from an XML file and registers each of them with addJob.
// \param jobs_file path handed to Launcher::XML_Persistence::loadJobs.
// \return the list of job numbers collected from the addJob calls.
// A LauncherException raised for one job is caught and logged; the loop then
// goes on with the next job.
// NOTE(review): the declaration of jobId, any condition guarding push_back and
// the handling of new_job after a failure are not visible in this listing.
844 Launcher_cpp::loadJobs(const char* jobs_file)
846 list<int> new_jobs_id_list;
848 // Load the jobs from XML file
849 list<Launcher::Job *> jobs_list = Launcher::XML_Persistence::loadJobs(jobs_file);
851 // Create each job in the launcher
852 list<Launcher::Job *>::const_iterator it_job;
853 for (it_job = jobs_list.begin(); it_job != jobs_list.end(); it_job++)
855 Launcher::Job * new_job = *it_job;
859 jobId = addJob(new_job);
861 new_jobs_id_list.push_back(jobId);
865 catch(const LauncherException &ex)
867 LAUNCHER_INFOS("Cannot load the job. Exception: " << ex.msg.c_str());
872 return new_jobs_id_list;
// Saves the jobs known to the launcher into an XML file.
// \param jobs_file path handed to Launcher::XML_Persistence::saveJobs.
// The jobs are collected in increasing job number: every number from 0 up to,
// but not including, _job_cpt is looked up in _launcher_job_map and numbers
// without an entry are skipped.
876 Launcher_cpp::saveJobs(const char* jobs_file)
878 // Create a sorted list from the internal job map
879 list<const Launcher::Job *> jobs_list;
881 for (int i=0; i<_job_cpt; i++)
883 map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(i);
884 if (it_job != _launcher_job_map.end())
885 jobs_list.push_back(it_job->second);
888 // Save the jobs in XML file
889 Launcher::XML_Persistence::saveJobs(jobs_file, jobs_list);
// Looks a job up by number in _launcher_job_map.
// \param job_id number of the job.
// Logs and throws LauncherException when the number is not in the map.
// NOTE(review): the return statement lies beyond the lines visible in this
// listing; presumably the job pointer found in the map is returned.
893 Launcher_cpp::findJob(int job_id)
895 std::map<int, Launcher::Job *>::const_iterator it_job = _launcher_job_map.find(job_id);
896 if (it_job == _launcher_job_map.end())
898 LAUNCHER_INFOS("Cannot find the job, is it created ? job number: " << job_id);
899 throw LauncherException("Cannot find the job, is it created ?");
901 Launcher::Job * job = it_job->second;