From ab400e9748724d3ed4ee5d05be8d8c550c9c384c Mon Sep 17 00:00:00 2001 From: secher Date: Thu, 19 Jul 2007 08:57:11 +0000 Subject: [PATCH] get jobId when submitting job on cluster, query job and get results files on local machine when job is finished --- idl/SALOME_ContainerManager.idl | 14 +- src/Batch/BatchLight_BatchManager.cxx | 50 ++--- src/Batch/BatchLight_BatchManager.hxx | 14 +- src/Batch/BatchLight_BatchManager_SLURM.cxx | 183 ++++++++++++++---- src/Batch/BatchLight_BatchManager_SLURM.hxx | 6 +- src/Batch/BatchLight_Job.cxx | 7 +- src/Batch/BatchLight_Job.hxx | 14 +- src/Container/SALOME_ContainerManager.cxx | 80 +++++++- src/Container/SALOME_ContainerManager.hxx | 13 +- src/LifeCycleCORBA/Makefile.am | 2 + .../SALOME_ResourcesManager.cxx | 148 +++++++++++--- .../SALOME_ResourcesManager.hxx | 19 +- 12 files changed, 425 insertions(+), 125 deletions(-) diff --git a/idl/SALOME_ContainerManager.idl b/idl/SALOME_ContainerManager.idl index 487d11b9b..660e7122b 100644 --- a/idl/SALOME_ContainerManager.idl +++ b/idl/SALOME_ContainerManager.idl @@ -46,7 +46,7 @@ struct MachineParameters */ typedef sequence MachineList; typedef sequence CompoList; - typedef sequence FilesToExportList; + typedef sequence FilesList; /*! 
exception thrown if a computer is not found in the catalog @@ -73,10 +73,14 @@ struct MachineParameters in ResPolicy policy, in CompoList componentList ); - long batchSalomeJob( in string fileToExecute , - in FilesToExportList filesToExport , - in long NumberOfProcessors , - in MachineParameters params ) ; + long submitSalomeJob( in string fileToExecute, + in FilesList filesToExport, + in FilesList filesToImport, + in long NumberOfProcessors, + in MachineParameters params ) raises (SALOME::SALOME_Exception); + string querySalomeJob( in long jobId, in MachineParameters params ) raises (SALOME::SALOME_Exception); + void deleteSalomeJob( in long jobId, in MachineParameters params ) raises (SALOME::SALOME_Exception); + void getResultSalomeJob( in string directory, in long jobId, in MachineParameters params ) raises (SALOME::SALOME_Exception); string FindFirst(in MachineList possibleComputers); diff --git a/src/Batch/BatchLight_BatchManager.cxx b/src/Batch/BatchLight_BatchManager.cxx index 2abf24541..c69bcfbd9 100644 --- a/src/Batch/BatchLight_BatchManager.cxx +++ b/src/Batch/BatchLight_BatchManager.cxx @@ -76,7 +76,7 @@ namespace BatchLight { _dirForTmpFiles += thedate ; } - void BatchManager::exportInFiles(const char *fileToExecute, const Engines::FilesToExportList filesToExportList) throw(SALOME_Exception) + void BatchManager::exportInputFiles(const char *fileToExecute, const Engines::FilesList filesToExportList) throw(SALOME_Exception) { BEGIN_OF("BatchManager::exportInFiles"); string command = _params.protocol; @@ -147,36 +147,38 @@ namespace BatchLight { END_OF("BatchManager::exportInFiles"); } - void BatchManager::submit() throw(SALOME_Exception) + void BatchManager::importOutputFiles( const char *directory, const CORBA::Long jobId ) throw(SALOME_Exception) { - BEGIN_OF("BatchManager::submit"); + BEGIN_OF("BatchManager::importOutputFiles"); string command; int status; - if( _params.protocol == "rsh" ) - command = "rsh "; - else if( _params.protocol == "ssh" ) - 
command = "ssh "; - else - throw SALOME_Exception("Unknown protocol"); + const BatchLight::Job* myJob = _jobmap[jobId]; + Engines::FilesList filesToImportList = myJob->getFilesToImportList(); - if (_params.username != ""){ - command += _params.username; - command += "@"; + for ( int i = 0 ; i < filesToImportList.length() ; i++ ) { + if( _params.protocol == "rsh" ) + command = "rcp "; + else if( _params.protocol == "ssh" ) + command = "scp "; + else + throw SALOME_Exception("Unknown protocol"); + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + command += _params.hostname; + command += ":"; + command += filesToImportList[i] ; + command += " "; + command += directory; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on remote host"); } - command += _params.hostname; - command += " \"tcsh " ; - command += _dirForTmpFiles ; - command += "/" ; - command += _fileNameToExecute ; - command += "_bsub.sh\"" ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - END_OF("BatchManager::submit"); + END_OF("BatchManager::importOutputFiles"); } string BatchManager::BuildTemporaryFileName() const diff --git a/src/Batch/BatchLight_BatchManager.hxx b/src/Batch/BatchLight_BatchManager.hxx index 987e7c282..43f69f6b2 100644 --- a/src/Batch/BatchLight_BatchManager.hxx +++ b/src/Batch/BatchLight_BatchManager.hxx @@ -30,6 +30,7 @@ #define _BL_BATCHMANAGER_H_ #include +#include #include #include "Utils_SALOME_Exception.hxx" #include @@ -55,22 +56,23 @@ namespace BatchLight { virtual ~BatchManager(); // Methodes pour le controle des jobs : virtuelles pures - virtual const int submitJob(BatchLight::Job & job) = 0; // soumet un job au gestionnaire + virtual const int submitJob(BatchLight::Job* job) = 0; // soumet un job au gestionnaire virtual void deleteJob(const int & jobid) = 0; // retire un job 
du gestionnaire - virtual int queryJob(const int & jobid) = 0; // renvoie l'etat du job + virtual std::string queryJob(const int & jobid) = 0; // renvoie l'etat du job + void importOutputFiles( const char *directory, const CORBA::Long jobId ) throw(SALOME_Exception); protected: - batchParams _params; + batchParams _params; + + std::map _jobmap; std::string _dirForTmpFiles; // repertoire temporaire sur le serveur std::string _TmpFileName; std::string _fileNameToExecute; void setDirForTmpFiles(); - void exportInFiles( const char *fileToExecute, const Engines::FilesToExportList filesToExportList ) throw(SALOME_Exception); + void exportInputFiles( const char *fileToExecute, const Engines::FilesList filesToExportList ) throw(SALOME_Exception); virtual void buildSalomeCouplingScript( const char *fileToExecute ) throw(SALOME_Exception) = 0; virtual void buildSalomeBatchScript( const int nbproc ) throw(SALOME_Exception) = 0; - virtual void buildSalomeSubmitBatchScript() throw(SALOME_Exception) = 0; - void submit() throw(SALOME_Exception); std::string BuildTemporaryFileName() const; void RmTmpFile(); diff --git a/src/Batch/BatchLight_BatchManager_SLURM.cxx b/src/Batch/BatchLight_BatchManager_SLURM.cxx index ced1952b6..096325666 100644 --- a/src/Batch/BatchLight_BatchManager_SLURM.cxx +++ b/src/Batch/BatchLight_BatchManager_SLURM.cxx @@ -46,22 +46,37 @@ namespace BatchLight { // Destructeur BatchManager_SLURM::~BatchManager_SLURM() { - // Nothing to do + MESSAGE("BatchManager_SLURM destructor "<<_params.hostname); + std::map < int, const BatchLight::Job * >::const_iterator it; + for(it=_jobmap.begin();it!=_jobmap.end();it++) + delete it->second; + } // Methode pour le controle des jobs : soumet un job au gestionnaire - const int BatchManager_SLURM::submitJob(Job & job) + const int BatchManager_SLURM::submitJob(Job* job) { BEGIN_OF("BatchManager_SLURM::submitJob"); - int id=0; + int id; + // temporary directory on cluster to put input files for job setDirForTmpFiles(); 
SCRUTE(_dirForTmpFiles); - exportInFiles(job.getFileToExecute(),job.getFilesToExportList()); - buildSalomeCouplingScript(job.getFileToExecute()); - buildSalomeBatchScript(job.getNbProc()); - buildSalomeSubmitBatchScript(); - submit(); + + // export input files on cluster + exportInputFiles(job->getFileToExecute(),job->getFilesToExportList()); + + // build salome coupling script for job + buildSalomeCouplingScript(job->getFileToExecute()); + + // build batch script for job + buildSalomeBatchScript(job->getNbProc()); + + // submit job on cluster + id = submit(); + + // register job on map + _jobmap[id] = job; END_OF("BatchManager_SLURM::submitJob"); return id; } @@ -69,13 +84,95 @@ namespace BatchLight { // Methode pour le controle des jobs : retire un job du gestionnaire void BatchManager_SLURM::deleteJob(const int & jobid) { + BEGIN_OF("BatchManager_SLURM::deleteJob"); + string command; + int status; + ostringstream oss; + oss << jobid; + + // define command to kill the job + if( _params.protocol == "rsh" ) + command = "rsh "; + else if( _params.protocol == "ssh" ) + command = "ssh "; + else + throw SALOME_Exception("Unknown protocol"); + + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + + command += _params.hostname; + command += " \"bkill " ; + command += oss.str(); + command += "\""; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on remote host"); + + MESSAGE("jobId = " << jobid << "killed"); + END_OF("BatchManager_SLURM::deleteJob"); } // Methode pour le controle des jobs : renvoie l'etat du job - int BatchManager_SLURM::queryJob(const int & jobid) + string BatchManager_SLURM::queryJob(const int & jobid) { - int ji=0; - return ji; + BEGIN_OF("BatchManager_SLURM::queryJob"); + // define name of log file + string logFile="/tmp/logs/"; + logFile += getenv("USER"); + logFile += "/batchSalome_"; + + srand ( time(NULL) ); + int ir = rand(); + ostringstream 
oss; + oss << ir; + logFile += oss.str(); + logFile += ".log"; + + string command; + int status; + + // define command to query job status + if( _params.protocol == "rsh" ) + command = "rsh "; + else if( _params.protocol == "ssh" ) + command = "ssh "; + else + throw SALOME_Exception("Unknown protocol"); + + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + + command += _params.hostname; + command += " \"bjobs " ; + ostringstream oss2; + oss2 << jobid; + command += oss2.str(); + command += "\" > "; + command += logFile; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on remote host"); + + // read status of job in log file + char line[128]; + ifstream fp(logFile.c_str(),ios::in); + fp.getline(line,80,'\n'); + + string sjobid, username, jstatus; + fp >> sjobid; + fp >> username; + fp >> jstatus; + + MESSAGE("jobId = " << jobid << " " << jstatus); + END_OF("BatchManager_SLURM::queryJob"); + return jstatus; } void BatchManager_SLURM::buildSalomeCouplingScript( const char *fileToExecute ) throw(SALOME_Exception) @@ -208,53 +305,67 @@ namespace BatchLight { } - void BatchManager_SLURM::buildSalomeSubmitBatchScript() throw(SALOME_Exception) + int BatchManager_SLURM::submit() throw(SALOME_Exception) { + BEGIN_OF("BatchManager_SLURM::submit"); - BEGIN_OF("BatchManager_SLURM::buildSalomeSubmitBatchScript"); - _TmpFileName = BuildTemporaryFileName(); - int status; - ofstream tempOutputFile; - tempOutputFile.open(_TmpFileName.c_str(), ofstream::out ); + // define name of log file + string logFile="/tmp/logs/"; + logFile += getenv("USER"); + logFile += "/batchSalome_"; - tempOutputFile << "#! 
/bin/sh -f" << endl ; - tempOutputFile << "bsub < ~/" ; - tempOutputFile << _dirForTmpFiles ; - tempOutputFile << "/" ; - tempOutputFile << _fileNameToExecute ; - tempOutputFile << "_Batch.sh &" << endl ; - tempOutputFile.flush(); - tempOutputFile.close(); - chmod(_TmpFileName.c_str(), 0x1ED); - SCRUTE(_TmpFileName.c_str()) ; + srand ( time(NULL) ); + int ir = rand(); + ostringstream oss; + oss << ir; + logFile += oss.str(); + logFile += ".log"; string command; + int status; + + // define command to submit batch if( _params.protocol == "rsh" ) - command = "rcp "; + command = "rsh "; else if( _params.protocol == "ssh" ) - command = "scp "; + command = "ssh "; else throw SALOME_Exception("Unknown protocol"); - command += _TmpFileName; - command += " "; + if (_params.username != ""){ command += _params.username; command += "@"; } + command += _params.hostname; - command += ":"; + command += " \"bsub < " ; command += _dirForTmpFiles ; command += "/" ; command += _fileNameToExecute ; - command += "_bsub.sh" ; + command += "_Batch.sh\" > "; + command += logFile; SCRUTE(command.c_str()); status = system(command.c_str()); if(status) - throw SALOME_Exception("Error of connection on remote host"); + throw SALOME_Exception("Error of connection on remote host"); - RmTmpFile(); - END_OF("BatchManager_SLURM::buildSalomeSubmitBatchScript"); + // read id of submitted job in log file + char line[128]; + FILE *fp = fopen(logFile.c_str(),"r"); + fgets( line, 128, fp); + fclose(fp); + string sline(line); + int p1 = sline.find("<"); + int p2 = sline.find(">"); + string strjob = sline.substr(p1+1,p2-p1-1); + + int id; + istringstream iss(strjob); + iss >> id; + + END_OF("BatchManager_SLURM::submit"); + return id; } } diff --git a/src/Batch/BatchLight_BatchManager_SLURM.hxx b/src/Batch/BatchLight_BatchManager_SLURM.hxx index 6654e3be9..7fda6e3d1 100644 --- a/src/Batch/BatchLight_BatchManager_SLURM.hxx +++ b/src/Batch/BatchLight_BatchManager_SLURM.hxx @@ -45,14 +45,14 @@ namespace 
BatchLight { virtual ~BatchManager_SLURM(); // Methodes pour le controle des jobs : virtuelles pures - virtual const int submitJob(BatchLight::Job & job); // soumet un job au gestionnaire + virtual const int submitJob(BatchLight::Job* job); // soumet un job au gestionnaire virtual void deleteJob(const int & jobid); // retire un job du gestionnaire - virtual int queryJob(const int & jobid); // renvoie l'etat du job + virtual std::string queryJob(const int & jobid); // renvoie l'etat du job protected: virtual void buildSalomeCouplingScript( const char *fileToExecute ) throw(SALOME_Exception); virtual void buildSalomeBatchScript( const int nbproc ) throw(SALOME_Exception); - virtual void buildSalomeSubmitBatchScript() throw(SALOME_Exception); + virtual int submit() throw(SALOME_Exception); private: diff --git a/src/Batch/BatchLight_Job.cxx b/src/Batch/BatchLight_Job.cxx index 9693aa0e8..9762a98a6 100644 --- a/src/Batch/BatchLight_Job.cxx +++ b/src/Batch/BatchLight_Job.cxx @@ -32,9 +32,14 @@ using namespace std; namespace BatchLight { // Constructeur - Job::Job(const char *fileToExecute, const Engines::FilesToExportList& filesToExport, const int nbproc) : _fileToExecute(fileToExecute), _filesToExport(filesToExport), _nbproc(nbproc) + Job::Job(const char *fileToExecute, const Engines::FilesList& filesToExport, const Engines::FilesList& filesToImport, const int nbproc) : _fileToExecute(fileToExecute), _filesToExport(filesToExport), _filesToImport(filesToImport), _nbproc(nbproc) { // Nothing to do } + Job::~Job() + { + MESSAGE("Job destructor"); + } + } diff --git a/src/Batch/BatchLight_Job.hxx b/src/Batch/BatchLight_Job.hxx index 38ef6b853..23ac8f3bb 100644 --- a/src/Batch/BatchLight_Job.hxx +++ b/src/Batch/BatchLight_Job.hxx @@ -39,16 +39,18 @@ namespace BatchLight { { public: // Constructeurs et destructeur - Job(const char *fileToExecute, const Engines::FilesToExportList& filesToExport, const int nbproc); - virtual ~Job() {} + Job(const char *fileToExecute, const 
Engines::FilesList& filesToExport, const Engines::FilesList& filesToImport, const int nbproc); + virtual ~Job(); - const char *getFileToExecute() { return _fileToExecute; } - const Engines::FilesToExportList getFilesToExportList() { return _filesToExport; } - const int getNbProc() { return _nbproc; } + const char *getFileToExecute() const { return _fileToExecute; } + const Engines::FilesList getFilesToExportList() const { return _filesToExport; } + const Engines::FilesList getFilesToImportList() const { return _filesToImport; } + const int getNbProc() const { return _nbproc; } protected: const char* _fileToExecute; - const Engines::FilesToExportList _filesToExport; + const Engines::FilesList _filesToExport; + const Engines::FilesList _filesToImport; const int _nbproc; private: diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index 34fccca8d..6cce6fcd4 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -328,13 +328,81 @@ GiveContainer(const Engines::MachineParameters& params, * \param params : Constraints for the choice of the batch cluster */ //============================================================================= -CORBA::Long SALOME_ContainerManager::batchSalomeJob( - const char * fileToExecute , - const Engines::FilesToExportList& filesToExport , - const CORBA::Long NumberOfProcessors , - const Engines::MachineParameters& params) +CORBA::Long SALOME_ContainerManager::submitSalomeJob( const char * fileToExecute , + const Engines::FilesList& filesToExport , + const Engines::FilesList& filesToImport , + const CORBA::Long NumberOfProcessors , + const Engines::MachineParameters& params) { - _ResManager->batchSalomeJob(fileToExecute, filesToExport, NumberOfProcessors, params); + CORBA::Long jobId; + try{ + jobId = _ResManager->submitSalomeJob(fileToExecute, filesToExport, filesToImport, NumberOfProcessors, params); + } + catch(const SALOME_Exception &ex){ + 
INFOS("Caught exception."); + THROW_SALOME_CORBA_EXCEPTION(ex.what(),SALOME::INTERNAL_ERROR); + } + return jobId; +} + +//============================================================================= +/*! CORBA Method: + * Query a batch job on a cluster and returns the status of job + * \param jobId : identification of Salome job + * \param params : Constraints for the choice of the batch cluster + */ +//============================================================================= +char* SALOME_ContainerManager::querySalomeJob( const CORBA::Long jobId, + const Engines::MachineParameters& params) +{ + string status; + try{ + status = _ResManager->querySalomeJob( jobId, params); + } + catch(const SALOME_Exception &ex){ + INFOS("Caught exception."); + THROW_SALOME_CORBA_EXCEPTION(ex.what(),SALOME::BAD_PARAM); + } + return CORBA::string_dup(status.c_str()); +} + +//============================================================================= +/*! CORBA Method: + * Delete a batch job on a cluster + * \param jobId : identification of Salome job + * \param params : Constraints for the choice of the batch cluster + */ +//============================================================================= +void SALOME_ContainerManager::deleteSalomeJob( const CORBA::Long jobId, + const Engines::MachineParameters& params) +{ + try{ + _ResManager->deleteSalomeJob( jobId, params); + } + catch(const SALOME_Exception &ex){ + INFOS("Caught exception."); + THROW_SALOME_CORBA_EXCEPTION(ex.what(),SALOME::BAD_PARAM); + } +} + +//============================================================================= +/*! 
CORBA Method: + * Get result files of job on a cluster + * \param jobId : identification of Salome job + * \param params : Constraints for the choice of the batch cluster + */ +//============================================================================= +void SALOME_ContainerManager::getResultSalomeJob( const char *directory, + const CORBA::Long jobId, + const Engines::MachineParameters& params) +{ + try{ + _ResManager->getResultSalomeJob( directory, jobId, params); + } + catch(const SALOME_Exception &ex){ + INFOS("Caught exception."); + THROW_SALOME_CORBA_EXCEPTION(ex.what(),SALOME::BAD_PARAM); + } } //============================================================================= diff --git a/src/Container/SALOME_ContainerManager.hxx b/src/Container/SALOME_ContainerManager.hxx index f64950f87..bb0ca1fd5 100644 --- a/src/Container/SALOME_ContainerManager.hxx +++ b/src/Container/SALOME_ContainerManager.hxx @@ -72,10 +72,15 @@ public: Engines::ResPolicy policy, const Engines::CompoList& componentList); - CORBA::Long batchSalomeJob(const char * fileToExecute , - const Engines::FilesToExportList& filesToExport , - const CORBA::Long NumberOfProcessors , - const Engines::MachineParameters& params); + CORBA::Long submitSalomeJob(const char * fileToExecute , + const Engines::FilesList& filesToExport , + const Engines::FilesList& filesToImport , + const CORBA::Long NumberOfProcessors , + const Engines::MachineParameters& params); + + char* querySalomeJob( const CORBA::Long jobId, const Engines::MachineParameters& params); + void deleteSalomeJob( const CORBA::Long jobId, const Engines::MachineParameters& params); + void getResultSalomeJob( const char * directory, const CORBA::Long jobId, const Engines::MachineParameters& params ); Engines::MachineList * GetFittingResources(const Engines::MachineParameters& params, diff --git a/src/LifeCycleCORBA/Makefile.am b/src/LifeCycleCORBA/Makefile.am index d6ab39e71..f4060309a 100644 --- a/src/LifeCycleCORBA/Makefile.am +++ 
b/src/LifeCycleCORBA/Makefile.am @@ -85,6 +85,7 @@ libSalomeLifeCycleCORBA_la_CPPFLAGS = \ $(COMMON_CPPFLAGS) \ @PYTHON_INCLUDES@ \ @QT_MT_INCLUDES@ \ + -I$(srcdir)/../Batch \ -I$(srcdir)/../Container \ -I$(srcdir)/../Notification @@ -116,6 +117,7 @@ Test_LifeCycleCORBA_LDADD = \ TestContainerManager_SOURCES = TestContainerManager.cxx TestContainerManager_CPPFLAGS =\ + -I$(srcdir)/../Batch \ -I$(srcdir)/../Registry \ -I$(srcdir)/../Notification \ $(COMMON_CPPFLAGS) diff --git a/src/ResourcesManager/SALOME_ResourcesManager.cxx b/src/ResourcesManager/SALOME_ResourcesManager.cxx index dca172f23..38d1a12dd 100644 --- a/src/ResourcesManager/SALOME_ResourcesManager.cxx +++ b/src/ResourcesManager/SALOME_ResourcesManager.cxx @@ -19,7 +19,6 @@ // #include "SALOME_ResourcesManager.hxx" #include "BatchLight_Job.hxx" -#include "BatchLight_BatchManager_SLURM.hxx" #include "Utils_ExceptHandlers.hxx" #include "OpUtil.hxx" @@ -103,6 +102,9 @@ SALOME_ResourcesManager::SALOME_ResourcesManager(CORBA::ORB_ptr orb) SALOME_ResourcesManager::~SALOME_ResourcesManager() { delete _NS; + std::map < string, const BatchLight::BatchManager * >::const_iterator it; + for(it=_batchmap.begin();it!=_batchmap.end();it++) + delete it->second; } //============================================================================= @@ -515,42 +517,132 @@ SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer * Submit a batch job on a cluster and returns the JobId * \param fileToExecute : .py/.exe/.sh/... 
to execute on the batch cluster * \param filesToExport : to export on the batch cluster + * \param filesToImport : to import from the batch cluster after job * \param NumberOfProcessors : Number of processors needed on the batch cluster * \param params : Constraints for the choice of the batch cluster */ //============================================================================= -CORBA::Long SALOME_ResourcesManager::batchSalomeJob( - const char * fileToExecute , - const Engines::FilesToExportList& filesToExport , - const CORBA::Long NumberOfProcessors , - const Engines::MachineParameters& params) +CORBA::Long SALOME_ResourcesManager::submitSalomeJob( const char * fileToExecute , + const Engines::FilesList& filesToExport , + const Engines::FilesList& filesToImport , + const CORBA::Long NumberOfProcessors , + const Engines::MachineParameters& params) { - BEGIN_OF("SALOME_ResourcesManager::batchSalomeJob"); + BEGIN_OF("SALOME_ResourcesManager::submitSalomeJob"); + CORBA::Long jobId; + + // find a cluster matching the structure params Engines::CompoList aCompoList ; vector aMachineList = GetFittingResources( params , aCompoList ) ; const ParserResourcesType& resInfo = _resourcesList[aMachineList[0]]; - - BatchLight::batchParams p; - p.hostname = resInfo.Alias; - if( resInfo.Protocol == rsh ) - p.protocol = "rsh"; - else if( resInfo.Protocol == ssh ) - p.protocol = "ssh"; - else - throw SALOME_Exception("Unknown protocol"); - p.username = resInfo.UserName; - p.applipath = resInfo.AppliPath; - p.modulesList = resInfo.ModulesList; - - try{ - BatchLight::Job job = BatchLight::Job( fileToExecute, filesToExport, NumberOfProcessors ); - BatchLight::BatchManager_SLURM bms = BatchLight::BatchManager_SLURM(p); - bms.submitJob(job); + string clustername = resInfo.Alias; + + // search batch manager for that cluster in map or instantiate one + std::map < string, const BatchLight::BatchManager * >::const_iterator it = _batchmap.find(clustername); + if(it == _batchmap.end()){ + 
// define structure for batch manager + BatchLight::batchParams p; + p.hostname = clustername; + if( resInfo.Protocol == rsh ) + p.protocol = "rsh"; + else if( resInfo.Protocol == ssh ) + p.protocol = "ssh"; + else + throw SALOME_Exception("Unknown protocol"); + p.username = resInfo.UserName; + p.applipath = resInfo.AppliPath; + p.modulesList = resInfo.ModulesList; + _batchmap[clustername] = new BatchLight::BatchManager_SLURM(p); } - catch(const SALOME_Exception &ex){ - MESSAGE(ex.what()); - } - END_OF("SALOME_ResourcesManager::batchSalomeJob"); + BatchLight::BatchManager_SLURM* bms = (BatchLight::BatchManager_SLURM*)_batchmap[clustername]; + + // submit job on cluster + BatchLight::Job* job = new BatchLight::Job( fileToExecute, filesToExport, filesToImport, NumberOfProcessors ); + jobId = bms->submitJob(job); + + return(jobId); + END_OF("SALOME_ResourcesManager::submitSalomeJob"); +} + +//============================================================================= +/*! CORBA Method: + * query a batch job on a cluster and returns the status of the job + * \param jobId : identification of Salome job + * \param params : Constraints for the choice of the batch cluster + */ +//============================================================================= +string SALOME_ResourcesManager::querySalomeJob( const CORBA::Long jobId, const Engines::MachineParameters& params) throw(SALOME_Exception) +{ + string status; + + // find a cluster matching params structure + Engines::CompoList aCompoList ; + vector aMachineList = GetFittingResources( params , aCompoList ) ; + const ParserResourcesType& resInfo = _resourcesList[aMachineList[0]]; + string clustername = resInfo.Alias; + + // search batch manager for that cluster in map + std::map < string, const BatchLight::BatchManager * >::const_iterator it = _batchmap.find(clustername); + if(it == _batchmap.end()) + throw SALOME_Exception("no batchmanager for that cluster"); + + BatchLight::BatchManager_SLURM* bms = 
(BatchLight::BatchManager_SLURM*)_batchmap[clustername]; + + status = bms->queryJob(jobId); + return(status); +} + + +//============================================================================= +/*! CORBA Method: + * delete a batch job on a cluster + * \param jobId : identification of Salome job + * \param params : Constraints for the choice of the batch cluster + */ +//============================================================================= +void SALOME_ResourcesManager::deleteSalomeJob( const CORBA::Long jobId, const Engines::MachineParameters& params) throw(SALOME_Exception) +{ + // find a cluster matching params structure + Engines::CompoList aCompoList ; + vector aMachineList = GetFittingResources( params , aCompoList ) ; + const ParserResourcesType& resInfo = _resourcesList[aMachineList[0]]; + string clustername = resInfo.Alias; + + // search batch manager for that cluster in map + std::map < string, const BatchLight::BatchManager * >::const_iterator it = _batchmap.find(clustername); + if(it == _batchmap.end()) + throw SALOME_Exception("no batchmanager for that cluster"); + + BatchLight::BatchManager_SLURM* bms = (BatchLight::BatchManager_SLURM*)_batchmap[clustername]; + + bms->deleteJob(jobId); +} + +//============================================================================= +/*! 
CORBA Method: + * get result files of a batch job from a cluster + * \param jobId : identification of Salome job + * \param params : Constraints for the choice of the batch cluster + */ +//============================================================================= +void SALOME_ResourcesManager::getResultSalomeJob( const char *directory, + const CORBA::Long jobId, + const Engines::MachineParameters& params) throw(SALOME_Exception) +{ + // find a cluster matching params structure + Engines::CompoList aCompoList ; + vector aMachineList = GetFittingResources( params , aCompoList ) ; + const ParserResourcesType& resInfo = _resourcesList[aMachineList[0]]; + string clustername = resInfo.Alias; + + // search batch manager for that cluster in map + std::map < string, const BatchLight::BatchManager * >::const_iterator it = _batchmap.find(clustername); + if(it == _batchmap.end()) + throw SALOME_Exception("no batchmanager for that cluster"); + + BatchLight::BatchManager_SLURM* bms = (BatchLight::BatchManager_SLURM*)_batchmap[clustername]; + + bms->importOutputFiles( directory, jobId ); } //============================================================================= diff --git a/src/ResourcesManager/SALOME_ResourcesManager.hxx b/src/ResourcesManager/SALOME_ResourcesManager.hxx index 44cb6deac..f6a622fa3 100644 --- a/src/ResourcesManager/SALOME_ResourcesManager.hxx +++ b/src/ResourcesManager/SALOME_ResourcesManager.hxx @@ -23,6 +23,7 @@ #include "Utils_SALOME_Exception.hxx" #include "utilities.h" #include +#include "BatchLight_BatchManager_SLURM.hxx" #include "SALOME_ResourcesCatalog_Handler.hxx" #include "SALOME_LoadRateManager.hxx" #include "SALOME_NamingService.hxx" @@ -31,8 +32,6 @@ #include #include - - #if defined RESOURCESMANAGER_EXPORTS #if defined WIN32 #define RESOURCESMANAGER_EXPORT __declspec( dllexport ) @@ -76,10 +75,17 @@ class RESOURCESMANAGER_EXPORT SALOME_ResourcesManager (const std::string& machine, const Engines::MachineParameters& params, const long id); - CORBA::Long 
batchSalomeJob(const char * fileToExecute , - const Engines::FilesToExportList& filesToExport , - const CORBA::Long NumberOfProcessors , - const Engines::MachineParameters& params); + CORBA::Long submitSalomeJob(const char * fileToExecute , + const Engines::FilesList& filesToExport , + const Engines::FilesList& filesToImport , + const CORBA::Long NumberOfProcessors , + const Engines::MachineParameters& params); + + std::string querySalomeJob( const CORBA::Long jobId, const Engines::MachineParameters& params) throw(SALOME_Exception); + void deleteSalomeJob( const CORBA::Long jobId, const Engines::MachineParameters& params) throw(SALOME_Exception); + void getResultSalomeJob( const char *directory, + const CORBA::Long jobId, + const Engines::MachineParameters& params) throw(SALOME_Exception); std::string BuildCommandToLaunchLocalContainer (const Engines::MachineParameters& params, const long id); @@ -110,6 +116,7 @@ class RESOURCESMANAGER_EXPORT SALOME_ResourcesManager protected: SALOME_NamingService *_NS; + std::map _batchmap; std::string BuildTempFileToLaunchRemoteContainer (const std::string& machine, -- 2.39.2