From bf052bd7afc79f358f8bbb08082088fd3753dc92 Mon Sep 17 00:00:00 2001 From: secher Date: Tue, 17 Jul 2007 08:01:44 +0000 Subject: [PATCH] definition of BatchLight class to submit salome session on cluster --- src/Batch/BatchLight_BatchManager.cxx | 219 ++++++++++ src/Batch/BatchLight_BatchManager.hxx | 84 ++++ src/Batch/BatchLight_BatchManager_SLURM.cxx | 260 ++++++++++++ src/Batch/BatchLight_BatchManager_SLURM.hxx | 63 +++ src/Batch/BatchLight_Job.cxx | 40 ++ src/Batch/BatchLight_Job.hxx | 60 +++ src/Batch/Makefile.am | 12 +- src/Container/SALOME_ContainerManager.cxx | 60 +-- src/ResourcesManager/Makefile.am | 1 + .../SALOME_ResourcesManager.cxx | 382 ++---------------- .../SALOME_ResourcesManager.hxx | 21 +- 11 files changed, 778 insertions(+), 424 deletions(-) create mode 100644 src/Batch/BatchLight_BatchManager.cxx create mode 100644 src/Batch/BatchLight_BatchManager.hxx create mode 100644 src/Batch/BatchLight_BatchManager_SLURM.cxx create mode 100644 src/Batch/BatchLight_BatchManager_SLURM.hxx create mode 100644 src/Batch/BatchLight_Job.cxx create mode 100644 src/Batch/BatchLight_Job.hxx diff --git a/src/Batch/BatchLight_BatchManager.cxx b/src/Batch/BatchLight_BatchManager.cxx new file mode 100644 index 000000000..2abf24541 --- /dev/null +++ b/src/Batch/BatchLight_BatchManager.cxx @@ -0,0 +1,219 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * BatchManager.cxx : + * + * Author : Bernard SECHER - CEA/DEN + * Date : July 2007 + * Project : SALOME + * + */ + +#include +#include +#include +#include +#include "BatchLight_Job.hxx" +#include "BatchLight_BatchManager.hxx" +#include "Batch_Date.hxx" +using namespace std; + +namespace BatchLight { + + // Constructor: stores the connection parameters and throws SALOME_Exception when the host name cannot be resolved + BatchManager::BatchManager(const batchParams& p) throw(SALOME_Exception) : _params(p) + { + SCRUTE(_params.hostname); + SCRUTE(_params.protocol); + SCRUTE(_params.username); + // Check that the hostname is valid + if (!gethostbyname(_params.hostname.c_str())) { // hostname unknown from network + string msg = "hostname \""; + msg += _params.hostname; + msg += "\" unknown from the network"; + throw SALOME_Exception(msg.c_str()); + } + } + + // Destructor + BatchManager::~BatchManager() + { + // Nothing to do + } + + void BatchManager::setDirForTmpFiles() + { + int i; + + _dirForTmpFiles = string("Batch/"); + Batch::Date date = Batch::Date(time(0)) ; + std::string thedate = date.str() ; + int lend = thedate.size() ; + i = 0 ; + while ( i < lend ) { + if ( thedate[i] == '/' || thedate[i] == '-' || thedate[i] == ':' ) { + thedate[i] = '_' ; + } + i++ ; + } + _dirForTmpFiles += thedate ; + } + + void BatchManager::exportInFiles(const char *fileToExecute, const Engines::FilesToExportList filesToExportList) throw(SALOME_Exception) + { + BEGIN_OF("BatchManager::exportInFiles"); + string command = _params.protocol; + int status; + + command += " "; + + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + + command += _params.hostname; + command += " \"mkdir -p "; + command += _dirForTmpFiles ; + command
+= "\"" ; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on remote host"); + + if( _params.protocol == "rsh" ) + command = "rcp "; + else if( _params.protocol == "ssh" ) + command = "scp "; + else + throw SALOME_Exception("Unknown protocol"); + + command += fileToExecute; + command += " "; + + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + + command += _params.hostname; + command += ":"; + command += _dirForTmpFiles ; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on remote host"); + + int i ; + for ( i = 0 ; i < filesToExportList.length() ; i++ ) { + if( _params.protocol == "rsh" ) + command = "rcp "; + else if( _params.protocol == "ssh" ) + command = "scp "; + else + throw SALOME_Exception("Unknown protocol"); + command += filesToExportList[i] ; + command += " "; + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + command += _params.hostname; + command += ":"; + command += _dirForTmpFiles ; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on remote host"); + } + + END_OF("BatchManager::exportInFiles"); + } + + void BatchManager::submit() throw(SALOME_Exception) + { + BEGIN_OF("BatchManager::submit"); + string command; + int status; + + if( _params.protocol == "rsh" ) + command = "rsh "; + else if( _params.protocol == "ssh" ) + command = "ssh "; + else + throw SALOME_Exception("Unknown protocol"); + + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + + command += _params.hostname; + command += " \"tcsh " ; + command += _dirForTmpFiles ; + command += "/" ; + command += _fileNameToExecute ; + command += "_bsub.sh\"" ; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on
remote host"); + + END_OF("BatchManager::submit"); + } + + string BatchManager::BuildTemporaryFileName() const + { + // Build a more complex file name so that several SALOME sessions can coexist; NOTE(review): the value returned by mkstemp() below is discarded + char *temp = new char[19]; + strcpy(temp, "/tmp/command"); + strcat(temp, "XXXXXX"); +#ifndef WNT + + mkstemp(temp); +#else + + char aPID[80]; + itoa(getpid(), aPID, 10); + strcat(temp, aPID); +#endif + + string command(temp); + delete [] temp; + command += ".sh"; + return command; + } + +void BatchManager::RmTmpFile() +{ + if (_TmpFileName != "") + { + string command = "rm "; + command += _TmpFileName; + char *temp = strdup(command.c_str()); + int lgthTemp = strlen(temp); + temp[lgthTemp - 3] = '*'; + temp[lgthTemp - 2] = '\0'; + system(temp); + free(temp); + } +} + +} diff --git a/src/Batch/BatchLight_BatchManager.hxx b/src/Batch/BatchLight_BatchManager.hxx new file mode 100644 index 000000000..987e7c282 --- /dev/null +++ b/src/Batch/BatchLight_BatchManager.hxx @@ -0,0 +1,84 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details.
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * BatchManager.hxx : + * + * Author : Bernard SECHER - CEA/DEN + * Date : July 2007 + * Project : SALOME + * + */ + +#ifndef _BL_BATCHMANAGER_H_ +#define _BL_BATCHMANAGER_H_ + +#include +#include +#include "Utils_SALOME_Exception.hxx" +#include +#include CORBA_CLIENT_HEADER(SALOME_ContainerManager) + +namespace BatchLight { + + class Job; + + struct batchParams{ + std::string hostname; // server on which the batch manager runs + std::string protocol; // protocol used to access the server: ssh or rsh + std::string username; // user name used to access the server + std::string applipath; // path of application directory on server + std::vector modulesList; // list of Salome modules installed on server + }; + + class BatchManager + { + public: + // Constructor and destructor + BatchManager(const batchParams& p) throw(SALOME_Exception); // connection to the host machine + virtual ~BatchManager(); + + // Job control methods: pure virtual + virtual const int submitJob(BatchLight::Job & job) = 0; // submits a job to the batch manager + virtual void deleteJob(const int & jobid) = 0; // removes a job from the batch manager + virtual int queryJob(const int & jobid) = 0; // returns the state of the job + + protected: + batchParams _params; + std::string _dirForTmpFiles; // temporary directory on the server + std::string _TmpFileName; + std::string _fileNameToExecute; + + void setDirForTmpFiles(); + void exportInFiles( const char *fileToExecute, const Engines::FilesToExportList filesToExportList ) throw(SALOME_Exception); + virtual void buildSalomeCouplingScript( const char *fileToExecute ) throw(SALOME_Exception) = 0; + virtual void buildSalomeBatchScript( const int nbproc )
throw(SALOME_Exception) = 0; + virtual void buildSalomeSubmitBatchScript() throw(SALOME_Exception) = 0; + void submit() throw(SALOME_Exception); + + std::string BuildTemporaryFileName() const; + void RmTmpFile(); + + private: + + }; + +} + +#endif diff --git a/src/Batch/BatchLight_BatchManager_SLURM.cxx b/src/Batch/BatchLight_BatchManager_SLURM.cxx new file mode 100644 index 000000000..ced1952b6 --- /dev/null +++ b/src/Batch/BatchLight_BatchManager_SLURM.cxx @@ -0,0 +1,260 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details.
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * BatchManager.cxx : + * + * Author : Bernard SECHER - CEA/DEN + * Date : July 2007 + * Project : SALOME + * + */ + +#include "BatchLight_BatchManager_SLURM.hxx" +#include "utilities.h" +#include "BatchLight_Job.hxx" +#include +#include +#include +#include + +using namespace std; + +namespace BatchLight { + + // Constructor: forwards the connection parameters to BatchManager + BatchManager_SLURM::BatchManager_SLURM(const batchParams& p) throw(SALOME_Exception) : BatchManager(p) + { + } + + // Destructor + BatchManager_SLURM::~BatchManager_SLURM() + { + // Nothing to do + } + + // Job control method: submits a job to the batch manager (exports the files, builds the three scripts, runs the submit script); currently always returns 0 + const int BatchManager_SLURM::submitJob(Job & job) + { + BEGIN_OF("BatchManager_SLURM::submitJob"); + int id=0; + + setDirForTmpFiles(); + SCRUTE(_dirForTmpFiles); + exportInFiles(job.getFileToExecute(),job.getFilesToExportList()); + buildSalomeCouplingScript(job.getFileToExecute()); + buildSalomeBatchScript(job.getNbProc()); + buildSalomeSubmitBatchScript(); + submit(); + END_OF("BatchManager_SLURM::submitJob"); + return id; + } + + // Job control method: removes a job from the batch manager (currently an empty stub) + void BatchManager_SLURM::deleteJob(const int & jobid) + { + } + + // Job control method: returns the state of the job (currently a stub that always returns 0) + int BatchManager_SLURM::queryJob(const int & jobid) + { + int ji=0; + return ji; + } + + void BatchManager_SLURM::buildSalomeCouplingScript( const char *fileToExecute ) throw(SALOME_Exception) + { + BEGIN_OF("BatchManager_SLURM::buildSalomeCouplingScript"); + int status; + int lenf = strlen( fileToExecute ) ; + int i = lenf-1 ; + while ( i >= 0 && fileToExecute[i] != '/' ) { + i -= 1 ; + } + char * FileNameToExecute = new
char[lenf-4-i] ; + strncpy(FileNameToExecute , &fileToExecute[i+1] , lenf-4-i) ; + _fileNameToExecute = string( FileNameToExecute ) ; + delete FileNameToExecute ; + SCRUTE(_fileNameToExecute) ; + + _TmpFileName = BuildTemporaryFileName(); + ofstream tempOutputFile; + tempOutputFile.open(_TmpFileName.c_str(), ofstream::out ); + tempOutputFile << "#! /bin/sh -f" << endl ; + tempOutputFile << "cd " ; + tempOutputFile << _params.applipath << endl ; + tempOutputFile << "export PYTHONPATH=~/" ; + tempOutputFile << _dirForTmpFiles ; + tempOutputFile << ":$PYTHONPATH" << endl ; + tempOutputFile << "if test $SLURM_PROCID = 0; then" << endl ; + tempOutputFile << " ./runAppli --terminal --batch --modules=" ; + for ( i = 0 ; i < _params.modulesList.size() ; i++ ) { + tempOutputFile << _params.modulesList[i] ; + if ( i != _params.modulesList.size()-1 ) + tempOutputFile << "," ; + } + tempOutputFile << " --standalone=registry,study,moduleCatalog --killall &" << endl ; + tempOutputFile << " for ((ip=1; ip < ${SLURM_NPROCS} ; ip++))" << endl; + tempOutputFile << " do" << endl ; + tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ; + tempOutputFile << " done" << endl ; + tempOutputFile << " sleep 5" << endl ; + tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ; + tempOutputFile << " ./runSession python ~/" << _dirForTmpFiles << "/" << _fileNameToExecute << ".py" << endl; + tempOutputFile << " ./runSession killCurrentPort" << endl; + tempOutputFile << "else" << endl ; + tempOutputFile << " sleep 5" << endl ; + tempOutputFile << " ./runSession waitNS.py" << endl ; + tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'${SLURM_PROCID}" << endl ; + tempOutputFile << "fi" << endl ; + tempOutputFile.flush(); + tempOutputFile.close(); + chmod(_TmpFileName.c_str(), 0x1ED); + SCRUTE(_TmpFileName.c_str()) ; + + string command; + if( _params.protocol == "rsh" ) + command = "rcp "; + else if( _params.protocol == "ssh" ) + command = "scp "; + else
+ throw SALOME_Exception("Unknown protocol"); + + command += _TmpFileName; + command += " "; + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + command += _params.hostname; + command += ":"; + command += _dirForTmpFiles ; + command += "/runSalome_" ; + command += _fileNameToExecute ; + command += "_Batch.sh" ; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on remote host"); + RmTmpFile(); + + END_OF("BatchManager_SLURM::buildSalomeCouplingScript"); + } + + void BatchManager_SLURM::buildSalomeBatchScript( const int nbproc ) throw(SALOME_Exception) + { + BEGIN_OF("BatchManager_SLURM::buildSalomeBatchScript"); + int status; + _TmpFileName = BuildTemporaryFileName(); + ofstream tempOutputFile; + tempOutputFile.open(_TmpFileName.c_str(), ofstream::out ); + + tempOutputFile << "#! /bin/sh -f" << endl ; + tempOutputFile << "#BSUB -n " ; + tempOutputFile << nbproc << endl ; + tempOutputFile << "#BSUB -o runSalome.log%J" << endl ; + tempOutputFile << "mpirun -srun ~/" ; + tempOutputFile << _dirForTmpFiles ; + tempOutputFile << "/runSalome_" ; + tempOutputFile << _fileNameToExecute ; + tempOutputFile << "_Batch.sh" << endl ; + tempOutputFile.flush(); + tempOutputFile.close(); + chmod(_TmpFileName.c_str(), 0x1ED); + SCRUTE(_TmpFileName.c_str()) ; + + string command; + if( _params.protocol == "rsh" ) + command = "rcp "; + else if( _params.protocol == "ssh" ) + command = "scp "; + else + throw SALOME_Exception("Unknown protocol"); + command += _TmpFileName; + command += " "; + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + command += _params.hostname; + command += ":"; + command += _dirForTmpFiles ; + command += "/" ; + command += _fileNameToExecute ; + command += "_Batch.sh" ; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on remote host"); + + RmTmpFile(); +
END_OF("BatchManager_SLURM::buildSalomeBatchScript"); + + } + + void BatchManager_SLURM::buildSalomeSubmitBatchScript() throw(SALOME_Exception) + { + + BEGIN_OF("BatchManager_SLURM::buildSalomeSubmitBatchScript"); + _TmpFileName = BuildTemporaryFileName(); + int status; + ofstream tempOutputFile; + tempOutputFile.open(_TmpFileName.c_str(), ofstream::out ); + + tempOutputFile << "#! /bin/sh -f" << endl ; + tempOutputFile << "bsub < ~/" ; + tempOutputFile << _dirForTmpFiles ; + tempOutputFile << "/" ; + tempOutputFile << _fileNameToExecute ; + tempOutputFile << "_Batch.sh &" << endl ; + tempOutputFile.flush(); + tempOutputFile.close(); + chmod(_TmpFileName.c_str(), 0x1ED); + SCRUTE(_TmpFileName.c_str()) ; + + string command; + if( _params.protocol == "rsh" ) + command = "rcp "; + else if( _params.protocol == "ssh" ) + command = "scp "; + else + throw SALOME_Exception("Unknown protocol"); + command += _TmpFileName; + command += " "; + if (_params.username != ""){ + command += _params.username; + command += "@"; + } + command += _params.hostname; + command += ":"; + command += _dirForTmpFiles ; + command += "/" ; + command += _fileNameToExecute ; + command += "_bsub.sh" ; + SCRUTE(command.c_str()); + status = system(command.c_str()); + if(status) + throw SALOME_Exception("Error of connection on remote host"); + + RmTmpFile(); + END_OF("BatchManager_SLURM::buildSalomeSubmitBatchScript"); + + } + +} diff --git a/src/Batch/BatchLight_BatchManager_SLURM.hxx b/src/Batch/BatchLight_BatchManager_SLURM.hxx new file mode 100644 index 000000000..6654e3be9 --- /dev/null +++ b/src/Batch/BatchLight_BatchManager_SLURM.hxx @@ -0,0 +1,63 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License.
+// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * BatchManager.hxx : + * + * Author : Bernard SECHER - CEA/DEN + * Date : July 2007 + * Project : SALOME + * + */ + +#ifndef _BL_BATCHMANAGER_SLURM_H_ +#define _BL_BATCHMANAGER_SLURM_H_ + +#include +#include "Utils_SALOME_Exception.hxx" +#include "BatchLight_BatchManager.hxx" + +namespace BatchLight { + + class Job; + + class BatchManager_SLURM : public BatchManager + { + public: + // Constructor and destructor + BatchManager_SLURM(const batchParams& p) throw(SALOME_Exception); // connection to the host machine + virtual ~BatchManager_SLURM(); + + // Job control methods: implementations of the pure virtuals declared in BatchManager + virtual const int submitJob(BatchLight::Job & job); // submits a job to the batch manager + virtual void deleteJob(const int & jobid); // removes a job from the batch manager + virtual int queryJob(const int & jobid); // returns the state of the job + + protected: + virtual void buildSalomeCouplingScript( const char *fileToExecute ) throw(SALOME_Exception); + virtual void buildSalomeBatchScript( const int nbproc ) throw(SALOME_Exception); + virtual void buildSalomeSubmitBatchScript() throw(SALOME_Exception); + + private: + + }; + +} + +#endif diff --git a/src/Batch/BatchLight_Job.cxx b/src/Batch/BatchLight_Job.cxx new file mode 100644 index 000000000..9693aa0e8 --- /dev/null +++ b/src/Batch/BatchLight_Job.cxx @@ -0,0 +1,40 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG,
PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * Job.cxx : + * + * Author : Bernard SECHER - CEA/DEN + * Date : July 2007 + * Project : SALOME + * + */ + +#include "BatchLight_Job.hxx" +using namespace std; + +namespace BatchLight { + + // Constructor: keeps the fileToExecute pointer itself (the string is not copied) together with the export list and the processor count + Job::Job(const char *fileToExecute, const Engines::FilesToExportList& filesToExport, const int nbproc) : _fileToExecute(fileToExecute), _filesToExport(filesToExport), _nbproc(nbproc) + { + // Nothing to do + } + +} diff --git a/src/Batch/BatchLight_Job.hxx b/src/Batch/BatchLight_Job.hxx new file mode 100644 index 000000000..38ef6b853 --- /dev/null +++ b/src/Batch/BatchLight_Job.hxx @@ -0,0 +1,60 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * Job.hxx : + * + * Author : Bernard SECHER - CEA/DEN + * Date : July 2007 + * Project : SALOME + * + */ + +#ifndef _BL_JOB_H_ +#define _BL_JOB_H_ + +#include "utilities.h" +#include +#include CORBA_CLIENT_HEADER(SALOME_ContainerManager) + +namespace BatchLight { + + class Job + { + public: + // Constructor and destructor + Job(const char *fileToExecute, const Engines::FilesToExportList& filesToExport, const int nbproc); + virtual ~Job() {} + + const char *getFileToExecute() { return _fileToExecute; } + const Engines::FilesToExportList getFilesToExportList() { return _filesToExport; } + const int getNbProc() { return _nbproc; } + + protected: + const char* _fileToExecute; + const Engines::FilesToExportList _filesToExport; + const int _nbproc; + + private: + + }; + +} + +#endif diff --git a/src/Batch/Makefile.am b/src/Batch/Makefile.am index 095173c50..f91377a67 100644 --- a/src/Batch/Makefile.am +++ b/src/Batch/Makefile.am @@ -59,7 +59,10 @@ LIB_INCLUDES = \ Batch_PyVersatile.hxx \ Batch_RunTimeException.hxx \ Batch_StringType.hxx \ - Batch_TypeMismatchException.hxx + Batch_TypeMismatchException.hxx \ + BatchLight_BatchManager.hxx \ + BatchLight_BatchManager_SLURM.hxx \ + BatchLight_Job.hxx LIB_SRC = \ @@ -91,7 +94,10 @@ LIB_SRC = \ Batch_PyVersatile.cxx \ Batch_RunTimeException.cxx \ Batch_StringType.cxx \ - Batch_TypeMismatchException.cxx + Batch_TypeMismatchException.cxx \ + BatchLight_BatchManager.cxx \ + BatchLight_BatchManager_SLURM.cxx \ + BatchLight_Job.cxx LIB_CPPFLAGS = \ @@ -203,7 +209,9 @@ libSalomeBatch_la_CPPFLAGS = \ @PYTHON_INCLUDES@ \ -I$(srcdir)/../Basics \
-I$(srcdir)/../SALOMELocalTrace \ + -I$(srcdir)/../Utils \ -I$(top_builddir)/salome_adm/unix \ + -I$(top_builddir)/idl \ $(LIB_CPPFLAGS) libSalomeBatch_la_LDFLAGS = -no-undefined -version-info=0:0:0 diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index 858ad1add..34fccca8d 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -334,65 +334,7 @@ CORBA::Long SALOME_ContainerManager::batchSalomeJob( const CORBA::Long NumberOfProcessors , const Engines::MachineParameters& params) { - BEGIN_OF("SALOME_ContainerManager::batchSalomeJob"); - // Determination provisoire de l'extension .py - // Il faudra une classe dans Utils pour gerer les Path FileNames et Extensions - int lenf = strlen( fileToExecute ) ; - if ( strcmp( &fileToExecute[lenf-3] ,".py" ) == NULL ) { - int i = lenf-1 ; - while ( i >= 0 && fileToExecute[i] != '/' ) { - i -= 1 ; - } - char * FileNameToExecute = new char[lenf-4-i] ; - strncpy(FileNameToExecute , &fileToExecute[i+1] , lenf-4-i) ; - string fileNameToExecute =string( FileNameToExecute ) ; - delete FileNameToExecute ; - SCRUTE(fileNameToExecute) ; -// Le /tmp n'est pas le meme d'un noeud a un autre ===> - //string DirForTmpFiles = string("/tmp/")+string(getenv("USER"))+string("/") ; - string DirForTmpFiles = string("Batch/") ; - Batch::Date date = Batch::Date(time(0)) ; - std::string thedate = date.str() ; - int lend = thedate.size() ; - i = 0 ; - while ( i < lend ) { - if ( thedate[i] == '/' || thedate[i] == '-' || thedate[i] == ':' ) { - thedate[i] = '_' ; - } - i++ ; - } - SCRUTE(thedate); - DirForTmpFiles += thedate ; - SCRUTE(DirForTmpFiles) ; - // Problemes avec ResourcesManager ... 
- // Solution pour l'instant : - // 31.05.107 : hostname : tantal - // Alias : tantale.ccc.cea.fr - Engines::CompoList aCompoList ; - Engines::MachineList aMachineList = *GetFittingResources( params , aCompoList ) ; - SCRUTE(aMachineList[0]) ; - std::string aCluster = FindFirst( aMachineList) ; - SCRUTE(aCluster) ; - //Creation of /tmp/$USER/date_hh_mn_ss/ and copy of FileNameToExecute - // and of filesToExport in that directory - _ResManager->CopyFileNamesToExecute(aCluster,DirForTmpFiles,fileToExecute,filesToExport) ; - //Creation of /tmp/$USER/date_hh_mn_ss/runSalome_'FileNameToExecute'_Batch.sh - string runSalome_Batch = _ResManager->BuildCmdrunSalomeBatch(aCluster,DirForTmpFiles,fileNameToExecute) ; - SCRUTE(runSalome_Batch) ; - //Creation of /tmp/$USER/date_hh_mn_ss/'FileNameToExecute'_Batch.sh - string FileNameToExecute_Batch = _ResManager->BuildCmdFileNameToExecute_Batch(aCluster,NumberOfProcessors,DirForTmpFiles,fileNameToExecute) ; - SCRUTE(FileNameToExecute_Batch) ; - //Creation of /tmp/$USER/date_hh_mn_ss/'FileNameToExecute'_bsub.sh - string FileNameToExecute_bsub = _ResManager->BuildCmdFileNameToExecute_bsub(aCluster,DirForTmpFiles,fileNameToExecute) ; - SCRUTE(FileNameToExecute_bsub) ; - //Launch of /tmp/$USER/date_hh_mn_ss/'FileNameToExecute'_bsub.sh on theCluster - string sshCommand = _ResManager->CmdToExecute_bsub(aCluster,DirForTmpFiles,fileNameToExecute) ; - SCRUTE(sshCommand) ; - } - else { - MESSAGE("SALOME_ContainerManager::batchSalomeJob unknown extension " << fileToExecute); - } - END_OF("SALOME_ContainerManager::batchSalomeJob"); + _ResManager->batchSalomeJob(fileToExecute, filesToExport, NumberOfProcessors, params); } //============================================================================= diff --git a/src/ResourcesManager/Makefile.am b/src/ResourcesManager/Makefile.am index eaa49f92a..6edd6a6c4 100755 --- a/src/ResourcesManager/Makefile.am +++ b/src/ResourcesManager/Makefile.am @@ -48,6 +48,7 @@ salomeinclude_HEADERS = \ # This 
local variable defines the list of CPPFLAGS common to all target in this package. COMMON_CPPFLAGS=\ + -I$(srcdir)/../Batch \ -I$(srcdir)/../Basics \ -I$(srcdir)/../SALOMELocalTrace \ -I$(srcdir)/../NamingService \ diff --git a/src/ResourcesManager/SALOME_ResourcesManager.cxx b/src/ResourcesManager/SALOME_ResourcesManager.cxx index 0b774d3fe..dca172f23 100644 --- a/src/ResourcesManager/SALOME_ResourcesManager.cxx +++ b/src/ResourcesManager/SALOME_ResourcesManager.cxx @@ -18,7 +18,8 @@ // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com // #include "SALOME_ResourcesManager.hxx" -//#include "SALOME_Container_i.hxx" +#include "BatchLight_Job.hxx" +#include "BatchLight_BatchManager_SLURM.hxx" #include "Utils_ExceptHandlers.hxx" #include "OpUtil.hxx" @@ -510,357 +511,46 @@ SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer } //============================================================================= -/*! - * Copy FileNameToExecute and filesToExport in DirForTmpFiles of machine - */ -//============================================================================= -void SALOME_ResourcesManager::CopyFileNamesToExecute(const std::string& machine, - const std::string& DirForTmpFiles , - const std::string& PathFileNameToExecute , - const Engines::FilesToExportList& filesToExport) throw(SALOME_Exception) -{ - BEGIN_OF("SALOME_ResourcesManager::CopyFileNamesToExecute"); - const ParserResourcesType& resInfo = _resourcesList[machine]; - string command; - int status; - - if (resInfo.Protocol == rsh) - command = "rsh "; - else if (resInfo.Protocol == ssh) - command = "ssh "; - else - throw SALOME_Exception("Unknown protocol"); - - if (resInfo.UserName != ""){ - command += resInfo.UserName; - command += "@"; - } - - command += resInfo.Alias; - command += " \"mkdir -p "; - command += DirForTmpFiles ; - command += "\"" ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection 
on remote host"); - - if (resInfo.Protocol == rsh) - command = "rcp "; - else if (resInfo.Protocol == ssh) - command = "scp "; - else - throw SALOME_Exception("Unknown protocol"); - - command += PathFileNameToExecute ; - command += " "; - - if (resInfo.UserName != ""){ - command += resInfo.UserName; - command += "@"; - } - - command += resInfo.Alias; - command += ":"; - command += DirForTmpFiles ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - int i ; - for ( i = 0 ; i < filesToExport.length() ; i++ ) { - if (resInfo.Protocol == rsh) - command = "rcp "; - else if (resInfo.Protocol == ssh) - command = "scp "; - else - throw SALOME_Exception("Unknown protocol"); - command += filesToExport[i] ; - command += " "; - if (resInfo.UserName != ""){ - command += resInfo.UserName; - command += "@"; - } - command += resInfo.Alias; - command += ":"; - command += DirForTmpFiles ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - } - - END_OF("SALOME_ResourcesManager::CopyFileNamesToExecute"); -} - -//============================================================================= -/*! - * builds the shell to create for runSalome Batch on a Cluster : - * #! 
/bin/sh -f - * source preReqFilePath - * export PYTHONPATH=/$HOME/Batch/date_hh_mn_ss:$PYTHONPATH - * if test $SLURM_PROCID = 0; then - * runSalome --terminal --batch --modules=ListOfModules --standalone=registry,study,moduleCatalog --execute='FileNameToExecute',killall --killall - * else - * sleep 10 - * export SALOME_BATCH="1" - * SALOME_Container "YACS_Server_"${SLURM_PROCID} - * fi - * - * with preReqFilePath of CatalogResource for Salome environnement - * with ListOfModules of CatalogResource - * with FileNameToExecute as python script to execute - */ -//============================================================================= -std::string SALOME_ResourcesManager::BuildCmdrunSalomeBatch( - const std::string& machine, - const std::string& DirForTmpFiles , - const std::string& FileNameToExecute ) throw(SALOME_Exception) -{ - BEGIN_OF("SALOME_ResourcesManager::BuildCmdrunSalomeBatch"); - int status; - _TmpFileName = BuildTemporaryFileName(); - ofstream tempOutputFile; - tempOutputFile.open(_TmpFileName.c_str(), ofstream::out ); - const ParserResourcesType& resInfo = _resourcesList[machine]; - resInfo.Print() ; - tempOutputFile << "#! 
/bin/sh -f" << endl ; - tempOutputFile << "cd " ; - tempOutputFile << resInfo.AppliPath << endl ; - tempOutputFile << "export PYTHONPATH=~/" ; - tempOutputFile << DirForTmpFiles ; - tempOutputFile << ":$PYTHONPATH" << endl ; - tempOutputFile << "if test $SLURM_PROCID = 0; then" << endl ; - tempOutputFile << " ./runAppli --terminal --batch --modules=" ; - int i ; - for ( i = 0 ; i < resInfo.ModulesList.size() ; i++ ) { - tempOutputFile << resInfo.ModulesList[i] ; - if ( i != resInfo.ModulesList.size()-1 ) - tempOutputFile << "," ; - } - tempOutputFile << " --standalone=registry,study,moduleCatalog --killall &" << endl ; - tempOutputFile << " for ((ip=1; ip < ${SLURM_NPROCS} ; ip++))" << endl; - tempOutputFile << " do" << endl ; - tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ; - tempOutputFile << " done" << endl ; - tempOutputFile << " sleep 5" << endl ; - tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ; - tempOutputFile << " ./runSession python ~/" << DirForTmpFiles << "/" << FileNameToExecute << ".py" << endl; - tempOutputFile << " ./runSession killCurrentPort" << endl; - tempOutputFile << "else" << endl ; - tempOutputFile << " sleep 5" << endl ; - tempOutputFile << " ./runSession waitNS.py" << endl ; - tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'${SLURM_PROCID}" << endl ; - tempOutputFile << "fi" << endl ; - tempOutputFile.flush(); - tempOutputFile.close(); - chmod(_TmpFileName.c_str(), 0x1ED); - SCRUTE(_TmpFileName.c_str()) ; - - string command; - if (resInfo.Protocol == rsh) - command = "rcp "; - else if (resInfo.Protocol == ssh) - command = "scp "; - else - throw SALOME_Exception("Unknown protocol"); - - command += _TmpFileName; - command += " "; - if (resInfo.UserName != ""){ - command += resInfo.UserName; - command += "@"; - } - command += resInfo.Alias; - command += ":"; - command += DirForTmpFiles ; - command += "/runSalome_" ; - command += FileNameToExecute ; - command += "_Batch.sh" ; - 
SCRUTE(command.c_str());
-  status = system(command.c_str());
-  if(status)
-    throw SALOME_Exception("Error of connection on remote host");
-  RmTmpFile();
-
-  END_OF("SALOME_ResourcesManager::BuildCmdrunSalomeBatch");
-  return command;
-}
-
-//=============================================================================
-/*!
- *  builds the shell to create for runSalome Batch on a Cluster :
- *  #! /bin/sh -f
- *  #BSUB -n NumberOfProcessors
- *  #BSUB -o runSalome.log%J
- *  mpirun -srun /$HOME/Batch/date_hh_mn_ss/runSalome_'FileNameToExecute'_Batch.sh
- *  with NumberOfProcessors from params
- *  with FileNameToExecute as python script to execute
- */
+/*! CORBA Method:
+ *  Submit a batch job on a cluster.
+ *  The first machine returned by GetFittingResources() for \a params is used;
+ *  its alias, protocol, user name, application path and module list are
+ *  forwarded to a BatchLight::BatchManager_SLURM which submits the job.
+ *  \param fileToExecute      : .py/.exe/.sh/... to execute on the batch cluster
+ *  \param filesToExport      : to export on the batch cluster
+ *  \param NumberOfProcessors : Number of processors needed on the batch cluster
+ *  \param params             : Constraints for the choice of the batch cluster
+ *  \return 0 when the job has been handed to the batch manager, -1 when the
+ *          submission raised a SALOME_Exception (the message is logged).
+ *          NOTE(review): the identifier assigned by the batch system is not
+ *          forwarded yet, the return type of submitJob() is not visible from
+ *          this file - confirm and return the real JobId if it is available.
+ *  \throw SALOME_Exception if no resource fits \a params or if the protocol
+ *         of the selected resource is neither rsh nor ssh
+ */
 //=============================================================================
-std::string SALOME_ResourcesManager::BuildCmdFileNameToExecute_Batch(
-                                   const std::string& machine,
-                                   const long NumberOfProcessors,
-                                   const std::string& DirForTmpFiles ,
-                                   const std::string& FileNameToExecute ) throw(SALOME_Exception)
+CORBA::Long SALOME_ResourcesManager::batchSalomeJob(
+                                   const char * fileToExecute ,
+                                   const Engines::FilesToExportList& filesToExport ,
+                                   const CORBA::Long NumberOfProcessors ,
+                                   const Engines::MachineParameters& params)
 {
-  BEGIN_OF("SALOME_ResourcesManager::BuildCmdFileNameToExecute_Batch");
-  int status;
-  _TmpFileName = BuildTemporaryFileName();
-  ofstream tempOutputFile;
-  tempOutputFile.open(_TmpFileName.c_str(), ofstream::out );
-  const ParserResourcesType& resInfo = _resourcesList[machine];
-  resInfo.Print() ;
-  tempOutputFile << "#! /bin/sh -f" << endl ;
-  tempOutputFile << "#BSUB -n " ;
-  tempOutputFile << NumberOfProcessors << endl ;
-  tempOutputFile << "#BSUB -o runSalome.log%J" << endl ;
-  tempOutputFile << "mpirun -srun ~/" ;
-  tempOutputFile << DirForTmpFiles ;
-  tempOutputFile << "/runSalome_" ;
-  tempOutputFile << FileNameToExecute ;
-  tempOutputFile << "_Batch.sh" << endl ;
-  tempOutputFile.flush();
-  tempOutputFile.close();
-  chmod(_TmpFileName.c_str(), 0x1ED);
-  SCRUTE(_TmpFileName.c_str()) ;
-
-  string command;
-  if (resInfo.Protocol == rsh)
-    command = "rcp ";
-  else if (resInfo.Protocol == ssh)
-    command = "scp ";
-  else
-    throw SALOME_Exception("Unknown protocol");
-  command += _TmpFileName;
-  command += " ";
-  if (resInfo.UserName != ""){
-    command += resInfo.UserName;
-    command += "@";
-  }
-  command += resInfo.Alias;
-  command += ":";
-  command += DirForTmpFiles ;
-  command += "/" ;
-  command += FileNameToExecute ;
-  command += "_Batch.sh" ;
-  SCRUTE(command.c_str());
-  status = system(command.c_str());
-  if(status)
-    throw SALOME_Exception("Error of connection on remote host");
-
-  RmTmpFile();
-  END_OF("SALOME_ResourcesManager::BuildCmdFileNameToExecute_Batch");
-
-  return command;
-}
-
-//=============================================================================
-/*!
- *  builds the shell to create for runSalome Batch on a Cluster :
- *  #! /bin/sh -f
- *  bsub < /$HOME/Batch/date_hh_mn_ss/'FileNameToExecute'_Batch.sh &
- *  with preReqFilePath of CatalogResource for Salome environnement
- *  with ListOfModules of CatalogResource
- *  with FileNameToExecute as python script to execute
- */
-//=============================================================================
-std::string SALOME_ResourcesManager::BuildCmdFileNameToExecute_bsub(
-                                   const std::string& machine,
-                                   const std::string& DirForTmpFiles ,
-                                   const std::string& FileNameToExecute ) throw(SALOME_Exception)
-{
-  BEGIN_OF("SALOME_ResourcesManager::BuildCmdFileNameToExecute_bsub");
-  _TmpFileName = BuildTemporaryFileName();
-  int status;
-  ofstream tempOutputFile;
-  tempOutputFile.open(_TmpFileName.c_str(), ofstream::out );
-  const ParserResourcesType& resInfo = _resourcesList[machine];
-  resInfo.Print() ;
-  tempOutputFile << "#! /bin/sh -f" << endl ;
-  tempOutputFile << "bsub < ~/" ;
-  tempOutputFile << DirForTmpFiles ;
-  tempOutputFile << "/" ;
-  tempOutputFile << FileNameToExecute ;
-  tempOutputFile << "_Batch.sh &" << endl ;
-  tempOutputFile.flush();
-  tempOutputFile.close();
-  chmod(_TmpFileName.c_str(), 0x1ED);
-  SCRUTE(_TmpFileName.c_str()) ;
-
-  string command;
-  if (resInfo.Protocol == rsh)
-    command = "rcp ";
-  else if (resInfo.Protocol == ssh)
-    command = "scp ";
+  BEGIN_OF("SALOME_ResourcesManager::batchSalomeJob");
+  Engines::CompoList aCompoList ;
+  std::vector<std::string> aMachineList = GetFittingResources( params , aCompoList ) ;
+  // Indexing an empty list is undefined behaviour: report the failure explicitly
+  if( aMachineList.empty() )
+    throw SALOME_Exception("No resources have been found with your parameters");
+  // Only the first fitting machine is considered
+  const ParserResourcesType& resInfo = _resourcesList[aMachineList[0]];
+
+  // Connection and application settings handed over to the batch manager
+  BatchLight::batchParams p;
+  p.hostname = resInfo.Alias;
+  if( resInfo.Protocol == rsh )
+    p.protocol = "rsh";
+  else if( resInfo.Protocol == ssh )
+    p.protocol = "ssh";
   else
     throw SALOME_Exception("Unknown protocol");
-  command += _TmpFileName;
-  command += " ";
-  if (resInfo.UserName != ""){
-    command += resInfo.UserName;
-    command += "@";
+  p.username = resInfo.UserName;
+  p.applipath = resInfo.AppliPath;
+  p.modulesList = resInfo.ModulesList;
+
+  // Value handed back to the caller: stays at -1 unless the submission succeeds
+  CORBA::Long submissionStatus = -1;
+  try{
+    BatchLight::Job job = BatchLight::Job( fileToExecute, filesToExport, NumberOfProcessors );
+    BatchLight::BatchManager_SLURM bms = BatchLight::BatchManager_SLURM(p);
+    bms.submitJob(job);
+    submissionStatus = 0;
   }
-  command += resInfo.Alias;
-  command += ":";
-  command += DirForTmpFiles ;
-  command += "/" ;
-  command += FileNameToExecute ;
-  command += "_bsub.sh" ;
-  SCRUTE(command.c_str());
-  status = system(command.c_str());
-  if(status)
-    throw SALOME_Exception("Error of connection on remote host");
-
-  RmTmpFile();
-  END_OF("SALOME_ResourcesManager::BuildCmdFileNameToExecute_bsub");
-
-  return command;
-}
-
-//=============================================================================
-/*!
- *  builds the rsh/ssh command for submitting of a batch job :
- *  ssh tantale.ccc.cea.fr /$HOME/Batch/date_hh_mn_ss/'FileNameToExecute'_bsub.sh
- *  with FileNameToExecute as python script to execute
- */
-//=============================================================================
-std::string SALOME_ResourcesManager::CmdToExecute_bsub(
-                                   const std::string& machine,
-                                   const std::string& DirForTmpFiles ,
-                                   const std::string& FileNameToExecute ) throw(SALOME_Exception)
-{
-  BEGIN_OF("SALOME_ResourcesManager::CmdToExecute_bsub");
-  const ParserResourcesType& resInfo = _resourcesList[machine];
-  string command;
-  resInfo.Print();
-  int status;
-
-  if (resInfo.Protocol == rsh)
-    command = "rsh " ;
-  else if (resInfo.Protocol == ssh)
-    command = "ssh ";
-  else
-    throw SALOME_Exception("Unknown protocol");
-  if (resInfo.UserName != ""){
-    command += resInfo.UserName;
-    command += "@";
+  catch(const SALOME_Exception &ex){
+    // The failure is logged and reported through the -1 return value
+    MESSAGE(ex.what());
   }
-  command += resInfo.Alias;
-  command += " \"tcsh " ;
-  command += DirForTmpFiles ;
-  command += "/" ;
-  command += FileNameToExecute ;
-  command += "_bsub.sh\"" ;
-  SCRUTE(command.c_str());
-  status = system(command.c_str());
-  if(status)
-    throw SALOME_Exception("Error of connection on remote host");
-
-  END_OF("SALOME_ResourcesManager::CmdToExecute_bsub");
-
-  return command;
+  END_OF("SALOME_ResourcesManager::batchSalomeJob");
+  return submissionStatus;
 }
 
 //=============================================================================
diff --git a/src/ResourcesManager/SALOME_ResourcesManager.hxx b/src/ResourcesManager/SALOME_ResourcesManager.hxx
index 952e40092..44cb6deac 100644
--- a/src/ResourcesManager/SALOME_ResourcesManager.hxx
+++ b/src/ResourcesManager/SALOME_ResourcesManager.hxx
@@ -76,23 +76,10 @@ class RESOURCESMANAGER_EXPORT SALOME_ResourcesManager
     (const std::string& machine,
      const Engines::MachineParameters& params, const long id);
 
-    void CopyFileNamesToExecute(const std::string& machine,
-                                const std::string& DirForTmpFiles ,
-                                const std::string& PathFileNameToExecute ,
-                                const Engines::FilesToExportList& filesToExport) throw(SALOME_Exception);
-    std::string BuildCmdrunSalomeBatch(const std::string& machine,
-                                       const std::string& DirForTmpFiles ,
-                                       const std::string& FileNameToExecute ) throw(SALOME_Exception);
-    std::string BuildCmdFileNameToExecute_Batch(const std::string& machine,
-                                                const long NumberOfProcessors,
-                                                const std::string& DirForTmpFiles ,
-                                                const std::string& FileNameToExecute ) throw(SALOME_Exception);
-    std::string BuildCmdFileNameToExecute_bsub(const std::string& machine,
-                                               const std::string& DirForTmpFiles ,
-                                               const std::string& FileNameToExecute ) throw(SALOME_Exception);
-    std::string CmdToExecute_bsub(const std::string& machine,
-                                  const std::string& DirForTmpFiles ,
-                                  const std::string& FileNameToExecute ) throw(SALOME_Exception);
+    CORBA::Long batchSalomeJob(const char * fileToExecute ,
+                               const Engines::FilesToExportList& filesToExport ,
+                               const CORBA::Long NumberOfProcessors ,
+                               const Engines::MachineParameters& params);
 
     std::string BuildCommandToLaunchLocalContainer
     (const Engines::MachineParameters& params, const long id);
-- 
2.39.2