From e29d7799260656fe3a90a34acc5f8ed05abbd119 Mon Sep 17 00:00:00 2001 From: secher Date: Thu, 24 Apr 2008 10:04:33 +0000 Subject: [PATCH] move buildSalomeCouplingScript from BatchLigth classes to Launcher --- src/Launcher/BatchLight_BatchManager.cxx | 32 +--- src/Launcher/BatchLight_BatchManager.hxx | 4 +- src/Launcher/BatchLight_BatchManager_PBS.cxx | 153 +----------------- src/Launcher/BatchLight_BatchManager_PBS.hxx | 3 +- .../BatchLight_BatchManager_SLURM.cxx | 95 +++-------- .../BatchLight_BatchManager_SLURM.hxx | 3 +- src/Launcher/BatchLight_Job.cxx | 25 ++- src/Launcher/BatchLight_Job.hxx | 6 +- src/Launcher/Launcher.cxx | 150 ++++++++++++++++- src/Launcher/Launcher.hxx | 3 + src/Launcher/Makefile.am | 5 +- src/Launcher/MpiImpl.cxx | 39 +++++ src/Launcher/MpiImpl.hxx | 19 +++ src/ResourcesManager/Makefile.am | 3 +- .../SALOME_ResourcesCatalog_Handler.cxx | 4 +- .../SALOME_ResourcesCatalog_Parser.hxx | 4 +- 16 files changed, 279 insertions(+), 269 deletions(-) diff --git a/src/Launcher/BatchLight_BatchManager.cxx b/src/Launcher/BatchLight_BatchManager.cxx index d615126bd..c1948bbc0 100644 --- a/src/Launcher/BatchLight_BatchManager.cxx +++ b/src/Launcher/BatchLight_BatchManager.cxx @@ -68,17 +68,11 @@ namespace BatchLight { { int id; - // temporary directory on cluster to put input files for job - setDirForTmpFiles(job); - // export input files on cluster exportInputFiles(job); - // build salome coupling script for job - buildSalomeCouplingScript(job); - // build batch script for job - buildSalomeBatchScript(job); + buildBatchScript(job); // submit job on cluster id = submit(job); @@ -88,28 +82,6 @@ namespace BatchLight { return id; } - void BatchManager::setDirForTmpFiles(BatchLight::Job* job) - { - std::string dirForTmpFiles; - std::string thedate; - - // Adding date to the directory name - Batch::Date date = Batch::Date(time(0)); - thedate = date.str(); - int lend = thedate.size() ; - int i = 0 ; - while ( i < lend ) { - if ( thedate[i] == '/' || thedate[i] == '-' || thedate[i] == ':' ) { - thedate[i] = '_' ; - } - i++ ; - } - - dirForTmpFiles += string("Batch/"); - dirForTmpFiles += thedate ; - job->setDirForTmpFiles(dirForTmpFiles); - } - void BatchManager::exportInputFiles(BatchLight::Job* job) throw(BatchException) { int status; @@ -278,6 +250,8 @@ namespace BatchLight { return new MpiImpl_MPICH2(); else if(mpiImpl == "openmpi") return new MpiImpl_OPENMPI(); + else if(mpiImpl == "slurm") + return new MpiImpl_SLURM(); else if(mpiImpl == "indif") throw BatchException("you must specify a mpi implementation in CatalogResources.xml file"); else{ diff --git a/src/Launcher/BatchLight_BatchManager.hxx b/src/Launcher/BatchLight_BatchManager.hxx index 65eb3d44f..753b06b56 100644 --- a/src/Launcher/BatchLight_BatchManager.hxx +++ b/src/Launcher/BatchLight_BatchManager.hxx @@ -88,10 +88,8 @@ namespace BatchLight { std::map _jobmap; virtual int submit(BatchLight::Job* job) throw(BatchException) = 0; - void setDirForTmpFiles(BatchLight::Job* job); void exportInputFiles(BatchLight::Job* job) throw(BatchException); - virtual void buildSalomeCouplingScript(BatchLight::Job* job) throw(BatchException) = 0; - virtual void buildSalomeBatchScript(BatchLight::Job* job) throw(BatchException) = 0; + virtual void buildBatchScript(BatchLight::Job* job) throw(BatchException) = 0; std::string BuildTemporaryFileName() const; void RmTmpFile(std::string & TemporaryFileName); diff --git a/src/Launcher/BatchLight_BatchManager_PBS.cxx b/src/Launcher/BatchLight_BatchManager_PBS.cxx index 3f2f4bc49..39e6e9fd6 100644 --- a/src/Launcher/BatchLight_BatchManager_PBS.cxx +++ b/src/Launcher/BatchLight_BatchManager_PBS.cxx @@ -160,147 +160,7 @@ namespace BatchLight { return jstatus; } - void BatchManager_PBS::buildSalomeCouplingScript(BatchLight::Job* job) throw(BatchException) - { - int status; - const string fileToExecute = job->getFileToExecute(); - const std::string dirForTmpFiles = job->getDirForTmpFiles(); - int idx = dirForTmpFiles.find("Batch/"); - std::string filelogtemp = dirForTmpFiles.substr(idx+6, dirForTmpFiles.length()); - - string::size_type p1 = fileToExecute.find_last_of("/"); - string::size_type p2 = fileToExecute.find_last_of("."); - std::string fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); - std::string TmpFileName = BuildTemporaryFileName(); - - ofstream tempOutputFile; - tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); - - // Begin - tempOutputFile << "#! /bin/sh -f" << endl ; - tempOutputFile << "cd " ; - tempOutputFile << _params.applipath << endl ; - tempOutputFile << "export SALOME_BATCH=1\n"; - tempOutputFile << "export PYTHONPATH=~/" ; - tempOutputFile << dirForTmpFiles ; - tempOutputFile << ":$PYTHONPATH" << endl ; - - // Test node rank - tempOutputFile << "if test " ; - tempOutputFile << _mpiImpl->rank() ; - tempOutputFile << " = 0; then" << endl ; - - // ----------------------------------------------- - // Code for rank 0 : launch runAppli and a container - // RunAppli - tempOutputFile << " ./runAppli --terminal --modules=" ; - for ( int i = 0 ; i < _params.modulesList.size() ; i++ ) { - tempOutputFile << _params.modulesList[i] ; - if ( i != _params.modulesList.size()-1 ) - tempOutputFile << "," ; - } - tempOutputFile << " --standalone=registry,study,moduleCatalog --ns-port-log=" - << filelogtemp - << " &\n"; - - // Wait NamingService - tempOutputFile << " current=0\n" - << " stop=20\n" - << " while ! test -f " << filelogtemp << "\n" - << " do\n" - << " sleep 2\n" - << " let current=current+1\n" - << " if [ \"$current\" -eq \"$stop\" ] ; then\n" - << " echo Error Naming Service failed ! >&2" - << " exit\n" - << " fi\n" - << " done\n" - << " port=`cat " << filelogtemp << "`\n"; - - // Launch a container - tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'" - << _mpiImpl->rank() - << " > ~/" << dirForTmpFiles << "/YACS_Server_" - << _mpiImpl->rank() << "_container_log." << filelogtemp - << " 2>&1 &\n"; - - // Wait other containers - tempOutputFile << " for ((ip=0; ip < "; - tempOutputFile << _mpiImpl->size(); - tempOutputFile << " ; ip++))" << endl; - tempOutputFile << " do" << endl ; - tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ; - tempOutputFile << " done" << endl ; - tempOutputFile << " sleep 5" << endl ; - tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ; - - // Launch user script - tempOutputFile << " ./runSession python ~/" << dirForTmpFiles << "/" << fileNameToExecute << ".py\n"; - - // Stop application - tempOutputFile << " rm " << filelogtemp << "\n" - << " ./runSession killSalomeWithPort.py $port\n"; - - // ------------------------------------- - // Other nodes launch a container - tempOutputFile << "else" << endl ; - - // Wait NamingService - tempOutputFile << " current=0\n" - << " stop=20\n" - << " while ! test -f " << filelogtemp << "\n" - << " do\n" - << " sleep 2\n" - << " let current=current+1\n" - << " if [ \"$current\" -eq \"$stop\" ] ; then\n" - << " echo Error Naming Service failed ! >&2" - << " exit\n" - << " fi\n" - << " done\n" - << " port=`cat " << filelogtemp << "`\n"; - - // Launching container - tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'"; - tempOutputFile << _mpiImpl->rank() - << " > ~/" << dirForTmpFiles << "/YACS_Server_" - << _mpiImpl->rank() << "_container_log." << filelogtemp - << " 2>&1\n"; - tempOutputFile << "fi" << endl; - tempOutputFile.flush(); - tempOutputFile.close(); - chmod(TmpFileName.c_str(), 0x1ED); - cerr << TmpFileName.c_str() << endl; - - string command; - if( _params.protocol == "rsh" ) - command = "rcp "; - else if( _params.protocol == "ssh" ) - command = "scp "; - else - throw BatchException("Unknown protocol"); - - command += TmpFileName; - command += " "; - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += ":"; - command += dirForTmpFiles ; - command += "/runSalome_" ; - command += fileNameToExecute ; - command += "_Batch.sh" ; - cerr << fileNameToExecute << endl; - cerr << command.c_str() << endl; - status = system(command.c_str()); - if(status) - throw BatchException("Error of connection on remote host"); - RmTmpFile(TmpFileName); - - } - - void BatchManager_PBS::buildSalomeBatchScript(BatchLight::Job* job) throw(BatchException) + void BatchManager_PBS::buildBatchScript(BatchLight::Job* job) throw(BatchException) { int status; const int nbproc = job->getNbProc(); @@ -310,7 +170,9 @@ namespace BatchLight { const string fileToExecute = job->getFileToExecute(); string::size_type p1 = fileToExecute.find_last_of("/"); string::size_type p2 = fileToExecute.find_last_of("."); - std::string fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + std::string rootNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + std::string fileNameToExecute = "~/" + dirForTmpFiles + "/" + string(basename(fileToExecute.c_str())); + int idx = dirForTmpFiles.find("Batch/"); std::string filelogtemp = dirForTmpFiles.substr(idx+6, dirForTmpFiles.length()); @@ -330,9 +192,6 @@ namespace BatchLight { ofstream tempOutputFile; tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); - ostringstream filenameToExecute; - filenameToExecute << " ~/" << dirForTmpFiles << "/runSalome_" << fileNameToExecute << "_Batch.sh"; - tempOutputFile << "#! /bin/sh -f" << endl ; tempOutputFile << "#PBS -l nodes=" << nbnodes << endl ; if (edt != "") @@ -346,7 +205,7 @@ namespace BatchLight { tempOutputFile << "#PBS -o runSalome.output.log." << filelogtemp << endl ; tempOutputFile << "#PBS -e runSalome.error.log." << filelogtemp << endl ; tempOutputFile << _mpiImpl->boot("${PBS_NODEFILE}",nbnodes); - tempOutputFile << _mpiImpl->run("${PBS_NODEFILE}",nbproc,filenameToExecute.str()); + tempOutputFile << _mpiImpl->run("${PBS_NODEFILE}",nbproc,fileNameToExecute); tempOutputFile << _mpiImpl->halt(); tempOutputFile.flush(); tempOutputFile.close(); @@ -370,7 +229,7 @@ namespace BatchLight { command += ":"; command += dirForTmpFiles ; command += "/" ; - command += fileNameToExecute ; + command += rootNameToExecute ; command += "_Batch.sh" ; cerr << command.c_str() << endl; status = system(command.c_str()); diff --git a/src/Launcher/BatchLight_BatchManager_PBS.hxx b/src/Launcher/BatchLight_BatchManager_PBS.hxx index 3f23f21f7..28c62580e 100644 --- a/src/Launcher/BatchLight_BatchManager_PBS.hxx +++ b/src/Launcher/BatchLight_BatchManager_PBS.hxx @@ -48,8 +48,7 @@ namespace BatchLight { std::string queryJob(const int & jobid); // renvoie l'etat du job private: - void buildSalomeCouplingScript(BatchLight::Job* job) throw(BatchException); - void buildSalomeBatchScript(BatchLight::Job* job) throw(BatchException); + void buildBatchScript(BatchLight::Job* job) throw(BatchException); int submit(BatchLight::Job* job) throw(BatchException); // Permet d'avoir la chaîne complête pour demander diff --git a/src/Launcher/BatchLight_BatchManager_SLURM.cxx b/src/Launcher/BatchLight_BatchManager_SLURM.cxx index 0c72d8b6b..3aefaee9a 100644 --- a/src/Launcher/BatchLight_BatchManager_SLURM.cxx +++ b/src/Launcher/BatchLight_BatchManager_SLURM.cxx @@ -40,6 +40,7 @@ namespace BatchLight { // Constructeur BatchManager_SLURM::BatchManager_SLURM(const clusterParams& p) throw(BatchException) : BatchManager(p) { + _mpiImpl = FactoryMpiImpl(_params.mpiImpl); } // Destructeur @@ -138,95 +139,41 @@ namespace BatchLight { return jstatus; } - void BatchManager_SLURM::buildSalomeCouplingScript(BatchLight::Job* job) throw(BatchException) + void BatchManager_SLURM::buildBatchScript(BatchLight::Job* job) throw(BatchException) { int status; - const string fileToExecute = job->getFileToExecute(); + const int nbproc = job->getNbProc(); + std::string edt = job->getExpectedDuringTime(); + std::string mem = job->getMemory(); const std::string dirForTmpFiles = job->getDirForTmpFiles(); - + const string fileToExecute = job->getFileToExecute(); string::size_type p1 = fileToExecute.find_last_of("/"); string::size_type p2 = fileToExecute.find_last_of("."); - std::string fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + std::string rootNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + std::string fileNameToExecute = "~/" + dirForTmpFiles + "/" + string(basename(fileToExecute.c_str())); - std::string TmpFileName = BuildTemporaryFileName(); - ofstream tempOutputFile; - tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); - tempOutputFile << "#! /bin/sh -f" << endl ; - tempOutputFile << "cd " ; - tempOutputFile << _params.applipath << endl ; - tempOutputFile << "export PYTHONPATH=~/" ; - tempOutputFile << dirForTmpFiles ; - tempOutputFile << ":$PYTHONPATH" << endl ; - tempOutputFile << "if test $SLURM_PROCID = 0; then" << endl ; - tempOutputFile << " ./runAppli --terminal --modules=" ; - for ( int i = 0 ; i < _params.modulesList.size() ; i++ ) { - tempOutputFile << _params.modulesList[i] ; - if ( i != _params.modulesList.size()-1 ) - tempOutputFile << "," ; + int nbmaxproc = _params.nbnodes * _params.nbprocpernode; + if( nbproc > nbmaxproc ){ + cerr << nbproc << " processors asked on a cluster of " << nbmaxproc << " processors" << endl; + throw BatchException("Too much processors asked for that cluster"); } - tempOutputFile << " --standalone=registry,study,moduleCatalog --killall &" << endl ; - tempOutputFile << " for ((ip=1; ip < ${SLURM_NPROCS} ; ip++))" << endl; - tempOutputFile << " do" << endl ; - tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ; - tempOutputFile << " done" << endl ; - tempOutputFile << " ./runSession waitNS.sh" << endl ; - tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ; - tempOutputFile << " ./runSession python ~/" << dirForTmpFiles << "/" << fileNameToExecute << ".py" << endl; - tempOutputFile << " ./runSession killCurrentPort" << endl; - tempOutputFile << "else" << endl ; - tempOutputFile << " ./runSession waitNS.sh" << endl ; - tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'${SLURM_PROCID}" << endl ; - tempOutputFile << "fi" << endl ; - tempOutputFile.flush(); - tempOutputFile.close(); - chmod(TmpFileName.c_str(), 0x1ED); - cerr << TmpFileName.c_str() << endl; - string command; - if( _params.protocol == "rsh" ) - command = "rcp "; - else if( _params.protocol == "ssh" ) - command = "scp "; + int nbnodes; + if( nbproc < _params.nbnodes ) + nbnodes = nbproc; else - throw BatchException("Unknown protocol"); - - command += TmpFileName; - command += " "; - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += ":"; - command += dirForTmpFiles ; - command += "/runSalome_" ; - command += fileNameToExecute ; - command += "_Batch.sh" ; - cerr << command.c_str() << endl; - status = system(command.c_str()); - if(status) - throw BatchException("Error of connection on remote host"); - RmTmpFile(TmpFileName); - - } + nbnodes = _params.nbnodes; - void BatchManager_SLURM::buildSalomeBatchScript(BatchLight::Job* job) throw(BatchException) - { - int status; - const int nbproc = job->getNbProc(); - const std::string dirForTmpFiles = job->getDirForTmpFiles(); std::string TmpFileName = BuildTemporaryFileName(); ofstream tempOutputFile; tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); - const string fileToExecute = job->getFileToExecute(); - string::size_type p1 = fileToExecute.find_last_of("/"); - string::size_type p2 = fileToExecute.find_last_of("."); - std::string fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); tempOutputFile << "#! /bin/sh -f" << endl ; tempOutputFile << "#BSUB -n " << nbproc << endl ; tempOutputFile << "#BSUB -o " << dirForTmpFiles << "/runSalome.log%J" << endl ; - tempOutputFile << "srun ~/" << dirForTmpFiles << "/runSalome_" << fileNameToExecute << "_Batch.sh" << endl ; + tempOutputFile << _mpiImpl->boot("",nbproc); + tempOutputFile << _mpiImpl->run("",nbproc,fileNameToExecute); + tempOutputFile << _mpiImpl->halt(); tempOutputFile.flush(); tempOutputFile.close(); chmod(TmpFileName.c_str(), 0x1ED); @@ -249,9 +196,9 @@ namespace BatchLight { command += ":"; command += dirForTmpFiles ; command += "/" ; - command += fileNameToExecute ; + command += rootNameToExecute ; command += "_Batch.sh" ; - cerr << command.c_str() << endl; + cerr << command.c_str() << endl; status = system(command.c_str()); if(status) throw BatchException("Error of connection on remote host"); diff --git a/src/Launcher/BatchLight_BatchManager_SLURM.hxx b/src/Launcher/BatchLight_BatchManager_SLURM.hxx index 72ce92624..97d2df190 100644 --- a/src/Launcher/BatchLight_BatchManager_SLURM.hxx +++ b/src/Launcher/BatchLight_BatchManager_SLURM.hxx @@ -48,8 +48,7 @@ namespace BatchLight { std::string queryJob(const int & jobid); // renvoie l'etat du job protected: - void buildSalomeCouplingScript(BatchLight::Job* job) throw(BatchException); - void buildSalomeBatchScript(BatchLight::Job* job) throw(BatchException); + void buildBatchScript(BatchLight::Job* job) throw(BatchException); int submit(BatchLight::Job* job) throw(BatchException); private: diff --git a/src/Launcher/BatchLight_Job.cxx b/src/Launcher/BatchLight_Job.cxx index 494693e8a..b2750d892 100644 --- a/src/Launcher/BatchLight_Job.cxx +++ b/src/Launcher/BatchLight_Job.cxx @@ -26,6 +26,7 @@ * */ +#include "Batch_Date.hxx" #include "BatchLight_Job.hxx" #include @@ -40,8 +41,22 @@ Job::Job(const string fileToExecute, _filesToImport(filesToImport), _batch_params(batch_params) { - _dirForTmpFiles = "/tmp/default_batch_tmp_directory"; - std::string _fileNameToExecute = ""; + std::string thedate; + + // Adding date to the directory name + Batch::Date date = Batch::Date(time(0)); + thedate = date.str(); + int lend = thedate.size() ; + int i = 0 ; + while ( i < lend ) { + if ( thedate[i] == '/' || thedate[i] == '-' || thedate[i] == ':' ) { + thedate[i] = '_' ; + } + i++ ; + } + + _dirForTmpFiles = string("Batch/"); + _dirForTmpFiles += thedate ; } Job::~Job() @@ -49,6 +64,12 @@ Job::~Job() cerr << "Job destructor" << endl; } +void +Job::addFileToExportList(std::string file_name) +{ + _filesToExport.push_back(file_name); +} + void Job::addFileToImportList(std::string file_name) { diff --git a/src/Launcher/BatchLight_Job.hxx b/src/Launcher/BatchLight_Job.hxx index 2d1142053..3287adc66 100644 --- a/src/Launcher/BatchLight_Job.hxx +++ b/src/Launcher/BatchLight_Job.hxx @@ -45,8 +45,10 @@ namespace BatchLight { virtual ~Job(); const std::string getFileToExecute() const { return _fileToExecute; } + void setFileToExecute(const std::string fileToExecute) { _fileToExecute=fileToExecute; } const std::vector getFilesToExportList() const { return _filesToExport; } const std::vector getFilesToImportList() const { return _filesToImport; } + void addFileToExportList(std::string file_name); void addFileToImportList(std::string file_name); const long getNbProc() const { return _batch_params.nb_proc; } const std::string getExpectedDuringTime(); @@ -57,8 +59,8 @@ namespace BatchLight { std::cerr << _dirForTmpFiles << std::endl;} bool check(); protected: - const std::string _fileToExecute; - const std::vector _filesToExport; + std::string _fileToExecute; + std::vector _filesToExport; std::vector _filesToImport; batchParams _batch_params; std::string _dirForTmpFiles; // Tmp directory on the server diff --git a/src/Launcher/Launcher.cxx b/src/Launcher/Launcher.cxx index 2385de73d..0730228bd 100644 --- a/src/Launcher/Launcher.cxx +++ b/src/Launcher/Launcher.cxx @@ -21,6 +21,9 @@ #include "BatchLight_BatchManager_SLURM.hxx" #include "BatchLight_Job.hxx" #include "Launcher.hxx" +#include +#include +#include using namespace std; @@ -102,6 +105,11 @@ long Launcher_cpp::submitSalomeJob( const string fileToExecute , delete job; throw LauncherException("Job parameters are bad (see informations above)"); } + + // build salome coupling script for job + buildSalomeCouplingScript(job,p); + + // submit job on cluster jobId = _batchmap[clustername]->submitJob(job); } catch(const BatchLight::BatchException &ex){ @@ -225,6 +233,9 @@ BatchLight::BatchManager *Launcher_cpp::FactoryBatchManager( const ParserResourc case openmpi: p.mpiImpl = "openmpi"; break; + case slurm: + p.mpiImpl = "slurm"; + break; default: p.mpiImpl = "indif"; break; @@ -235,7 +246,7 @@ BatchLight::BatchManager *Launcher_cpp::FactoryBatchManager( const ParserResourc case pbs: cerr << "Instantiation of PBS batch manager" << endl; return new BatchLight::BatchManager_PBS(p); - case slurm: + case lsf: cerr << "Instantiation of SLURM batch manager" << endl; return new BatchLight::BatchManager_SLURM(p); default: @@ -244,3 +255,140 @@ BatchLight::BatchManager *Launcher_cpp::FactoryBatchManager( const ParserResourc } } +void Launcher_cpp::buildSalomeCouplingScript(BatchLight::Job* job, const ParserResourcesType& params) +{ + const string fileToExecute = job->getFileToExecute(); + const std::string dirForTmpFiles = job->getDirForTmpFiles(); + int idx = dirForTmpFiles.find("Batch/"); + std::string filelogtemp = dirForTmpFiles.substr(idx+6, dirForTmpFiles.length()); + + string::size_type p1 = fileToExecute.find_last_of("/"); + string::size_type p2 = fileToExecute.find_last_of("."); + std::string fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + std::string TmpFileName = "/tmp/runSalome_" + fileNameToExecute + ".sh"; + + MpiImpl* mpiImpl = FactoryMpiImpl(params.mpi); + + ofstream tempOutputFile; + tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); + + // Begin + tempOutputFile << "#! /bin/sh -f" << endl ; + tempOutputFile << "cd " ; + tempOutputFile << params.AppliPath << endl ; + tempOutputFile << "export SALOME_BATCH=1\n"; + tempOutputFile << "export PYTHONPATH=~/" ; + tempOutputFile << dirForTmpFiles ; + tempOutputFile << ":$PYTHONPATH" << endl ; + + // Test node rank + tempOutputFile << "if test " ; + tempOutputFile << mpiImpl->rank() ; + tempOutputFile << " = 0; then" << endl ; + + // ----------------------------------------------- + // Code for rank 0 : launch runAppli and a container + // RunAppli + tempOutputFile << " ./runAppli --terminal --modules=" ; + for ( int i = 0 ; i < params.ModulesList.size() ; i++ ) { + tempOutputFile << params.ModulesList[i] ; + if ( i != params.ModulesList.size()-1 ) + tempOutputFile << "," ; + } + tempOutputFile << " --standalone=registry,study,moduleCatalog --ns-port-log=" + << filelogtemp + << " &\n"; + + // Wait NamingService + tempOutputFile << " current=0\n" + << " stop=20\n" + << " while ! test -f " << filelogtemp << "\n" + << " do\n" + << " sleep 2\n" + << " let current=current+1\n" + << " if [ \"$current\" -eq \"$stop\" ] ; then\n" + << " echo Error Naming Service failed ! >&2" + << " exit\n" + << " fi\n" + << " done\n" + << " port=`cat " << filelogtemp << "`\n"; + + // Wait other containers + tempOutputFile << " for ((ip=1; ip < "; + tempOutputFile << mpiImpl->size(); + tempOutputFile << " ; ip++))" << endl; + tempOutputFile << " do" << endl ; + tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ; + tempOutputFile << " done" << endl ; + tempOutputFile << " sleep 5" << endl ; + tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ; + + // Launch user script + tempOutputFile << " ./runSession python ~/" << dirForTmpFiles << "/" << fileNameToExecute << ".py" << endl; + + // Stop application + tempOutputFile << " rm " << filelogtemp << "\n" + << " ./runSession killCurrentPort" << endl; + // waiting standard killing improvement by P. Rascle + tempOutputFile << " killall notifd" << endl; + tempOutputFile << " killall omniNames" << endl; + + // ------------------------------------- + // Other nodes launch a container + tempOutputFile << "else" << endl ; + + // Wait NamingService + tempOutputFile << " current=0\n" + << " stop=20\n" + << " while ! test -f " << filelogtemp << "\n" + << " do\n" + << " sleep 2\n" + << " let current=current+1\n" + << " if [ \"$current\" -eq \"$stop\" ] ; then\n" + << " echo Error Naming Service failed ! >&2" + << " exit\n" + << " fi\n" + << " done\n" + << " port=`cat " << filelogtemp << "`\n"; + + // Launching container + tempOutputFile << " ./runSession SALOME_Container YACS_Server_"; + tempOutputFile << mpiImpl->rank() + << " > ~/" << dirForTmpFiles << "/YACS_Server_" + << mpiImpl->rank() << "_container_log." << filelogtemp + << " 2>&1\n"; + tempOutputFile << "fi" << endl ; + tempOutputFile.flush(); + tempOutputFile.close(); + chmod(TmpFileName.c_str(), 0x1ED); + cerr << TmpFileName.c_str() << endl; + + job->addFileToExportList(fileToExecute); + job->setFileToExecute(TmpFileName); + + delete mpiImpl; + +} + +MpiImpl *Launcher_cpp::FactoryMpiImpl(MpiImplType mpi) throw(LauncherException) +{ + switch(mpi){ + case lam: + return new MpiImpl_LAM(); + case mpich1: + return new MpiImpl_MPICH1(); + case mpich2: + return new MpiImpl_MPICH2(); + case openmpi: + return new MpiImpl_OPENMPI(); + case slurm: + return new MpiImpl_SLURM(); + case indif: + throw LauncherException("you must specify a mpi implementation in CatalogResources.xml file"); + default: + ostringstream oss; + oss << mpi << " : not yet implemented"; + throw LauncherException(oss.str().c_str()); + } + +} diff --git a/src/Launcher/Launcher.hxx b/src/Launcher/Launcher.hxx index b46dd0b2f..dc88799d8 100644 --- a/src/Launcher/Launcher.hxx +++ b/src/Launcher/Launcher.hxx @@ -53,6 +53,9 @@ public: void SetResourcesManager( ResourcesManager_cpp* rm ) { _ResManager = rm; } protected: + + void buildSalomeCouplingScript(BatchLight::Job* job, const ParserResourcesType& params); + MpiImpl *FactoryMpiImpl(MpiImplType mpiImpl) throw(LauncherException); BatchLight::BatchManager *FactoryBatchManager( const ParserResourcesType& params ) throw(LauncherException); std::map _batchmap; diff --git a/src/Launcher/Makefile.am b/src/Launcher/Makefile.am index 21a52f2ba..a54ef4490 100644 --- a/src/Launcher/Makefile.am +++ b/src/Launcher/Makefile.am @@ -40,8 +40,9 @@ salomeinclude_HEADERS = \ BatchLight_BatchManager_PBS.hxx \ BatchLight_BatchManager_SLURM.hxx \ BatchLight_Job.hxx \ - MpiImpl.hxx \ - SALOME_Launcher.hxx + MpiImpl.hxx \ + SALOME_Launcher.hxx \ + Launcher.hxx # Scripts to be installed dist_salomescript_DATA = diff --git a/src/Launcher/MpiImpl.cxx b/src/Launcher/MpiImpl.cxx index 1ac4074aa..012eeee73 100644 --- a/src/Launcher/MpiImpl.cxx +++ b/src/Launcher/MpiImpl.cxx @@ -209,3 +209,42 @@ string MpiImpl_OPENMPI::halt() return ""; } +// slurm implementation +// Constructor +MpiImpl_SLURM::MpiImpl_SLURM() : MpiImpl() +{ +} + +// Destructor +MpiImpl_SLURM::~MpiImpl_SLURM() +{ + cerr << "MpiImpl_SLURM destructor" << endl; +} + +string MpiImpl_SLURM::size() +{ + return "${SLURM_NPROCS}"; +} + +string MpiImpl_SLURM::rank() +{ + return "${SLURM_PROCID}"; +} + +string MpiImpl_SLURM::boot(const string machinefile, const unsigned int nbnodes) +{ + return ""; +} + +string MpiImpl_SLURM::run(const string machinefile, const unsigned int nbproc, const string fileNameToExecute) +{ + ostringstream oss; + oss << "srun " << fileNameToExecute << endl; + return oss.str(); +} + +string MpiImpl_SLURM::halt() +{ + return ""; +} + diff --git a/src/Launcher/MpiImpl.hxx b/src/Launcher/MpiImpl.hxx index 47f1283ef..07f306bdf 100644 --- a/src/Launcher/MpiImpl.hxx +++ b/src/Launcher/MpiImpl.hxx @@ -134,4 +134,23 @@ private: }; +class MpiImpl_SLURM : public MpiImpl +{ +public: + // Constructeur et destructeur + MpiImpl_SLURM(); // constructor + virtual ~MpiImpl_SLURM(); //Destructor + + std::string size(); // get number of process of current job + std::string rank(); // get process number of current job + std::string boot( const std::string machinefile, const unsigned int nbnodes); // get boot command + std::string run( const std::string machinefile, const unsigned int nbproc, const std::string fileNameToExecute); // get run command + std::string halt(); // get stop command + +protected: + +private: + +}; + #endif diff --git a/src/ResourcesManager/Makefile.am b/src/ResourcesManager/Makefile.am index 86eb62538..94db53c20 100755 --- a/src/ResourcesManager/Makefile.am +++ b/src/ResourcesManager/Makefile.am @@ -38,7 +38,8 @@ salomeinclude_HEADERS = \ SALOME_ResourcesCatalog_Parser.hxx \ SALOME_ResourcesManager.hxx \ SALOME_ResourcesCatalog_Handler.hxx \ - SALOME_LoadRateManager.hxx + SALOME_LoadRateManager.hxx \ + ResourcesManager.hxx # # =============================================================== diff --git a/src/ResourcesManager/SALOME_ResourcesCatalog_Handler.cxx b/src/ResourcesManager/SALOME_ResourcesCatalog_Handler.cxx index 49b30b700..136fd8771 100755 --- a/src/ResourcesManager/SALOME_ResourcesCatalog_Handler.cxx +++ b/src/ResourcesManager/SALOME_ResourcesCatalog_Handler.cxx @@ -187,8 +187,6 @@ void SALOME_ResourcesCatalog_Handler::ProcessXmlDocument(xmlDocPtr theDoc) _resource.Batch = pbs; else if (aBatch == "lsf") _resource.Batch = lsf; - else if (aBatch == "slurm") - _resource.Batch = slurm; else _resource.Batch = none; } @@ -206,6 +204,8 @@ void SALOME_ResourcesCatalog_Handler::ProcessXmlDocument(xmlDocPtr theDoc) _resource.mpi = mpich2; else if (anMpi == "openmpi") _resource.mpi = openmpi; + else if (anMpi == "slurm") + _resource.mpi = slurm; else _resource.mpi = indif; } diff --git a/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.hxx b/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.hxx index f8af1dbcd..f452b1a6c 100755 --- a/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.hxx +++ b/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.hxx @@ -38,9 +38,9 @@ enum AccessProtocolType {rsh, ssh}; enum AccessModeType {interactive, batch}; -enum BatchType {none, pbs, lsf, slurm}; +enum BatchType {none, pbs, lsf}; -enum MpiImplType {indif, lam, mpich1, mpich2, openmpi}; +enum MpiImplType {indif, lam, mpich1, mpich2, openmpi, slurm}; class ResourceDataToSort { -- 2.39.2